|
|
| Data Analyst Nanodegree, Udacity vamshi.krishna.prime@gmail.com |
|
from IPython.display import Image
Image("img/Metro Bike.jpg")
Image description: image of Metro Bike bicycle.
Import libraries¶===========================
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sb
from sqlalchemy import create_engine
%matplotlib inline
from matplotlib.lines import Line2D
Load Data¶=================
Flat File:Database:| Dataset | Available format | Description | Mode of access |
|---|---|---|---|
| bikeshare_clean | bikeshare_master.csv | A clean dataset in CSV format | Load directly using the read_csv method in pandas |
| bikeshare_clean | bikeshare_master.db | A relational database | Requires SQL query to gather data |
# Open the SQLite database file that sits next to the notebook.
engine = create_engine('sqlite:///bikeshare_master.db')
# Import data from the database into a dataframe using SQL query.
# The bike, time, fare and station tables are inner-joined on trip_id, so each
# result row carries the columns of one trip; two columns are renamed on the way
# (passholder_type -> pass_type, duration -> duration_min).
# NOTE: the query is one single-quoted string continued with backslashes, so no
# comment can be placed between its lines.
bikeshare = pd.read_sql('SELECT b.trip_id, \
b.bike_id, \
b.trip_type, \
b.bike_type, \
b.passholder_type AS pass_type, \
f.fare_type, \
t.start_time, \
t.end_time, \
t.duration AS duration_min, \
t.distance_miles, \
f.fare, \
s.start_station_id, \
s.start_lat, \
s.start_lon, \
s.end_station_id, \
s.end_lat, \
s.end_lon \
FROM bike AS b \
JOIN time AS t \
ON b.trip_id = t.trip_id \
JOIN fare AS f \
ON b.trip_id = f.trip_id \
JOIN station AS s \
ON t.trip_id = s.trip_id', engine)
Alternate approach is to load data from the flat file in CSV format.
bikeshare.info()
Not all columns retain their datatype information when the dataset is retrieved from the database. This is because the data is moved from one format/platform to another. The incorrect
column datatypes have to be assigned manually.
# Ordered category levels for each categorical column, lowest level first.
category_levels = {
    'trip_type': ['One Way', 'Round Trip'],
    'bike_type': ['unknown', 'Standard', 'Electric', 'Smart'],
    'pass_type': ['Walk-up', 'One Day', 'Monthly', 'Flex', 'Annual'],
    'fare_type': ['Base', 'Extended'],
}
for column, level_order in category_levels.items():
    ordered_cat = pd.api.types.CategoricalDtype(categories=level_order, ordered=True)
    bikeshare[column] = bikeshare[column].astype(ordered_cat)

# Parse both trip timestamps into datetime values.
for column in ('start_time', 'end_time'):
    bikeshare[column] = pd.to_datetime(bikeshare[column])

# Show the resulting dtypes for visual confirmation.
bikeshare.info()
Expand the dataset by extracting timeline variables for further plotting
The time series data related to rentals
(`hour`/`day`/`week`/`month`/`year`) needs to be prepared/extracted for further plotting.
%%time
# Derive the calendar columns from the trip start timestamp; each column takes
# the name of the datetime attribute it is read from.
start_dt = bikeshare['start_time'].dt
timeline_fields = ['year', 'month', 'weekday', 'day', 'hour']
for field in timeline_fields:
    bikeshare[field] = getattr(start_dt, field)
bikeshare[timeline_fields].head()
Extract daytime from the hour column:
Extract
day_sectionfromhourcolumn.
# Divide the hour of the day into customized sections. pd.cut builds
# right-closed intervals from consecutive edges:
# (-1, 5], (5, 11], (11, 16], (16, 20], (20, 23].
# The list is named `hour_bins` so the builtin `bin()` is not shadowed.
hour_bins = [-1, 5, 11, 16, 20, 23]
bikeshare['day_sections'] = pd.cut(bikeshare['start_time'].dt.hour, hour_bins)
bikeshare['day_sections'].head(10)
Explore the various methods to extract the
sections of the day based on the hour of the day. To find the best-performing method (the least time needed to extract the values), take the first 1000 entries in the dataset and measure the execution time of each method.
%%capture --no-stdout
def apply_section(row):
    """Return the day-section label for a single hour value.

    ``row`` is tested for membership in each interval of the categorical
    ``df_new.day_sections`` column; ``'unknown'`` is returned when no
    interval contains it.
    """
    labels = ('Early hours', 'Morning', 'Afternoon', 'Evening', 'Night')
    # .cat.categories lists the intervals in bin order. .unique() lists them in
    # order of first appearance in the data, which would tie the labels to the
    # row order and fail when fewer than five sections occur.
    for interval, label in zip(df_new.day_sections.cat.categories, labels):
        if row in interval:
            return label
    return 'unknown'
def map_identity(row):
    """Return the day-section label for a single hour value (used with ``Series.map``).

    ``row`` is tested for membership in each interval of the categorical
    ``df_new.day_sections`` column; ``'unknown'`` is returned when no
    interval contains it.
    """
    labels = ('Early hours', 'Morning', 'Afternoon', 'Evening', 'Night')
    # Iterate the intervals in bin order (.cat.categories) rather than in order
    # of first appearance (.unique()), so each label always meets its own bin.
    for interval, label in zip(df_new.day_sections.cat.categories, labels):
        if row in interval:
            return label
    return 'unknown'
def map_identity2(row):
    """Return the day-section label for a ``day_sections`` value.

    ``row`` is compared for equality with each interval of the categorical
    ``df_new.day_sections`` column; ``'unknown'`` is returned when none
    matches (for example a missing value).
    """
    labels = ('Early hours', 'Morning', 'Afternoon', 'Evening', 'Night')
    # Bin order comes from .cat.categories; .unique() would give the order of
    # first appearance in the data instead.
    for interval, label in zip(df_new.day_sections.cat.categories, labels):
        if row == interval:
            return label
    return 'unknown'
def mask_section(df):
    """Add a ``label4`` column holding the day-section name, built with ``Series.mask``.

    ``df`` must have a categorical ``day_sections`` column (as produced by
    ``pd.cut``). The frame is modified in place.
    """
    labels = ('Early hours', 'Morning', 'Afternoon', 'Evening', 'Night')
    sections = df['day_sections']
    # Work on an object copy: a categorical column cannot take values that are
    # not among its categories.
    labelled = sections.astype(object)
    # Chain the masks on the running result so every label survives, and take
    # the intervals in bin order (.cat.categories), not order of appearance.
    for interval, label in zip(sections.cat.categories, labels):
        labelled = labelled.mask(sections == interval, label)
    df['label4'] = labelled
def npwhere_section(df):
    """Add a ``label5`` column holding the day-section name, built with ``np.where``.

    ``df`` must have a categorical ``day_sections`` column (as produced by
    ``pd.cut``). The frame is modified in place; rows without a section keep
    their original (missing) value.
    """
    labels = ('Early hours', 'Morning', 'Afternoon', 'Evening', 'Night')
    sections = df['day_sections']
    labelled = sections.astype(object).to_numpy()
    # Intervals are taken in bin order (.cat.categories); .unique() follows the
    # order of first appearance and would mislabel unsorted data.
    for interval, label in zip(sections.cat.categories, labels):
        labelled = np.where(sections == interval, label, labelled)
    df['label5'] = labelled
def loc_section(df):
    """Add a ``label6`` column holding the day-section name, built with ``.loc``.

    ``df`` must have a categorical ``day_sections`` column (as produced by
    ``pd.cut``). The frame is modified in place.
    """
    labels = ('Early hours', 'Morning', 'Afternoon', 'Evening', 'Night')
    # .cat.categories yields the intervals in bin order; .unique() yields them
    # in order of first appearance, which depends on how the rows are sorted.
    for interval, label in zip(df['day_sections'].cat.categories, labels):
        df.loc[df['day_sections'] == interval, 'label6'] = label
# Benchmark every labelling strategy on a copy of the first 1000 rows, so the
# full dataset is not modified by the experiment.
df_new = bikeshare.head(1000).copy()
# label1: Series.apply on the hour column, one Python call per value
%time df_new['label1'] = df_new['hour'].apply(lambda row: apply_section(row))
# label2: Series.map on the hour column
%time df_new['label2'] = df_new['hour'].map(map_identity)
# label3: Series.map on the day_sections column itself
%time df_new['label3'] = df_new['day_sections'].map(map_identity2)
# label4: Series.mask (column written inside mask_section)
%time mask_section(df_new)
# label5: np.where (column written inside npwhere_section)
%time npwhere_section(df_new)
# label6: boolean-mask assignment with .loc (column written inside loc_section)
%time loc_section(df_new)
From the above, it is evident that the
`np.where`, `map`, and `.loc` methods (vectorized operations) perform best. However, on larger datasets, the `.loc` method performs better.
from IPython.display import Image
Image("img/performance chart.PNG", width = 600, height = 300)
It can be determined from the above steps that
.locmethod is the best solution to add new column by extracting/comparing values from the existing column.
Extract daytime from day_section.
%%time
def assign_daytime(df):
    """Add a ``daytime`` column naming the section of the day of each row.

    ``df`` must have a categorical ``day_sections`` column (as produced by
    ``pd.cut``). The frame is modified in place.
    """
    labels = ('Early hours', 'Morning', 'Afternoon', 'Evening', 'Night')
    # Pair the labels with the intervals in bin order (.cat.categories).
    # .unique() returns the intervals in order of first appearance, so the
    # labels would depend on how the rows happen to be sorted.
    for interval, label in zip(df['day_sections'].cat.categories, labels):
        df.loc[df['day_sections'] == interval, 'daytime'] = label
assign_daytime(bikeshare)
bikeshare.daytime.value_counts()
As estimated, the
`.loc` method exhibited the best performance, extracting the `daytime` values from the `day_sections` column with 808589 entries in around 1 second.
# display a sample of 'daytime' entries for visual confirmation
bikeshare[['day_sections', 'daytime']].sample(10)
Change weekday representation:
Change the
`weekday` representation from numeric values to descriptive values. As mentioned above, use the `.loc` method to derive the new values from the existing column values.
| Integer Value | Day of the week |
|---|---|
| 0 | Monday |
| 1 | Tuesday |
| 2 | Wednesday |
| 3 | Thursday |
| 4 | Friday |
| 5 | Saturday |
| 6 | Sunday |
%%time
def assign_weekday(df):
    """Replace the numeric ``weekday`` values (0-6) with day names, in place.

    0 maps to ``'Monday'`` and 6 to ``'Sunday'``. Values outside 0-6 are left
    untouched, and calling the function again on an already converted frame
    is a no-op.
    """
    day_names = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday')
    # Cast to object first: writing strings into an integer column is an
    # incompatible-dtype assignment that recent pandas versions warn about.
    df['weekday'] = df['weekday'].astype(object)
    for number, name in enumerate(day_names):
        df.loc[df['weekday'] == number, 'weekday'] = name
assign_weekday(bikeshare)
# display a sample of 'daytime' entries for visual confirmation
bikeshare[['weekday']].sample(10)
Extract the relative number of the week in a month:
Each month spans four full weeks plus up to three extra days, depending on the month and on whether it is a leap year,
so the day of the month is binned into five week sections. Extract the relative number of the week in each month.
# Day-of-month bin edges; pd.cut builds right-closed intervals from them:
# (0, 7], (7, 14], (14, 21], (21, 28], (28, 31].
# The list is named `day_bins` so the builtin `bin()` is not shadowed.
day_bins = [0, 7, 14, 21, 28, 31]
# use pd.cut to attribute each day of the month to its bin
bikeshare['week_sections'] = pd.cut(bikeshare['day'], day_bins)
bikeshare[['week_sections']].head()
bikeshare.week_sections.unique()
%%time
def assign_week(df):
    """Add a ``week`` column naming the relative week of the month of each row.

    ``df`` must have a categorical ``week_sections`` column (as produced by
    ``pd.cut``). The frame is modified in place.
    """
    labels = ('First', 'Second', 'Third', 'Fourth', 'Fifth')
    # Pair the labels with the intervals in bin order (.cat.categories);
    # .unique() would give the order of first appearance in the data.
    for interval, label in zip(df['week_sections'].cat.categories, labels):
        df.loc[df['week_sections'] == interval, 'week'] = label
assign_week(bikeshare)
bikeshare.week.value_counts()
bikeshare[['week_sections', 'week']].sample(10)
Extract quarter of the year from the month column:
Extract
quarter_sectionsfrommonthcolumn.
# Divide the months of the year into quarters. pd.cut builds right-closed
# intervals from consecutive edges: (0, 3], (3, 6], (6, 9], (9, 12].
# The list is named `month_bins` so the builtin `bin()` is not shadowed.
month_bins = [0, 3, 6, 9, 12]
# use pd.cut to attribute each month to its bin
bikeshare['quarter_sections'] = pd.cut(bikeshare['start_time'].dt.month, month_bins)
bikeshare['quarter_sections'].sample(10)
Extract
quarterfromquarter_sections.
bikeshare.quarter_sections.unique()
%%time
def extract_quarter(df):
    """Add a ``quarter`` column (``'Q1'``..``'Q4'``) for each row.

    ``df`` must have a categorical ``quarter_sections`` column (as produced by
    ``pd.cut``). The frame is modified in place.
    """
    labels = ('Q1', 'Q2', 'Q3', 'Q4')
    # Pair the labels with the intervals in bin order (.cat.categories);
    # .unique() would give the order of first appearance in the data.
    for interval, label in zip(df['quarter_sections'].cat.categories, labels):
        df.loc[df['quarter_sections'] == interval, 'quarter'] = label
extract_quarter(bikeshare)
bikeshare.quarter.value_counts()
As estimated, the
`.loc` method exhibited the best performance, extracting the `quarter` of the year values from the `quarter_sections` column with 808589 entries in under 1 second.
# display a sample of 'quarter' entries for visual confirmation
bikeshare[['quarter_sections', 'quarter']].sample(10)
Change datatypes of multiple columns to ordered categorical dtype:
# dtypes before the conversion
bikeshare.info()

df = bikeshare
# Ordered category levels for each derived timeline column, earliest first.
timeline_levels = {
    'daytime': ['Early hours', 'Morning', 'Afternoon', 'Evening', 'Night'],
    'weekday': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    'week': ['First', 'Second', 'Third', 'Fourth', 'Fifth'],
    'quarter': ['Q1', 'Q2', 'Q3', 'Q4'],
}
for column, level_order in timeline_levels.items():
    ordered_cat = pd.api.types.CategoricalDtype(categories=level_order, ordered=True)
    df[column] = df[column].astype(ordered_cat)

# dtypes after the conversion
bikeshare.info()
Remove redundant columns in the dataset:
# The interval columns were only stepping stones for the label columns.
cols_to_drop = ['day_sections', 'week_sections', 'quarter_sections']
bikeshare.drop(columns=cols_to_drop, inplace=True)
# List the remaining columns with their positions.
for i, col in enumerate(bikeshare.columns):
    print(f'{i:<2}:{col}')
Reorder columns in the dataset:
reorder columns as relevant/numerical data to the left most for visual analysis
# Identifiers and numeric measures first, then the trip categories, the
# timeline columns and finally the station data.
reordered_columns = (
    ['trip_id', 'bike_id', 'distance_miles', 'duration_min', 'fare']
    + ['trip_type', 'bike_type', 'pass_type', 'fare_type', 'start_time']
    + ['year', 'quarter', 'month', 'week', 'weekday', 'day', 'daytime', 'hour']
    + ['end_time', 'start_station_id', 'start_lat', 'start_lon']
    + ['end_station_id', 'end_lat', 'end_lon']
)
bikeshare = bikeshare.reindex(columns=reordered_columns)
# List the columns in their new order.
for i, col in enumerate(bikeshare.columns):
    print(f'{i:<2}: {col}')
# Preview the palette that is active before the switch.
sb.palplot(sb.color_palette())
plt.show()

# Switch to the colour-blind-friendly palette.
sb.set_palette('colorblind', n_colors=10, desat=None)

# Show the new palette, then show it once more as a visual confirmation.
for _ in range(2):
    current_palette = sb.color_palette()
    sb.palplot(current_palette)
    plt.show()
Bar Chart:
A
bar chartis used to depict the distribution of acategoricalvariable. In a bar chart, each level of the categorical variable is depicted with a bar, whose height indicates the frequency of data points that take on that level. A basic bar chart of frequencies can be created through the use of seaborn's countplot function:
Histogram:
A
histogramis used to plot the distribution of anumericalvariable. It is the quantitative version of the bar chart. However, rather than plot one bar for each unique numeric value, values are grouped intocontinuousbins, and one bar for each bin is plotted depicting the number.
Scatter plot:
A
scatter plotis used to inspect the relationship between two numeric variables. In a scatterplot, each data point is plotted individually as a point, its x-position corresponding to one feature value and its y-position corresponding to the second. One basic way of creating a scatterplot is through Matplotlib'sscatterfunction. An alternative approach is to use Seaborn'sregplotfunction which combines scatterplot creation with regression function fitting.
Facet Grid:
One general visualization technique that will be useful for you to know about to handle plots of two or more variables is
faceting. In faceting, the data is divided into disjoint subsets, most often by different levels of a categorical variable. For each of these subsets of the data, the same plot type is rendered on other variables. Faceting is a way of comparing distributions or relationships across levels of additional variables, especially when there are three or more variables of interest overall.
Clustered Bar chart:
To depict the relationship between two categorical variables, we can extend the univariate bar chart into a
clustered bar chart. In a clustered bar chart, bars are organized intoclustersbased on levels of thefirst variable, and then bars areorderedconsistently across thesecond variablewithin eachcluster.
Exploratory Data Analysis¶=========================================
trip_type column:¶Column: trip_typeData type: categorical data, nominalPlot : Bar chartBar Chart:
# Styling: white background and the colour-blind-friendly palette.
sb.set_style("white")
sb.set_palette(palette='colorblind', n_colors=10, desat=None)
current_palette = sb.color_palette()
base_color = current_palette[0]

# Count the rentals per trip type once; bars are drawn most frequent first.
trip_type_counts = bikeshare['trip_type'].value_counts()
trip_type_order = trip_type_counts.index
max_count = trip_type_counts.max()
n_points = bikeshare.shape[0]
tick_values = np.arange(0, max_count + 100000, 100000)
tick_names = [f'{v / 1000000:0.1f} M' for v in tick_values]

sb.countplot(data=bikeshare, x='trip_type', color=base_color, alpha=0.5, order=trip_type_order)

# Titles, axis labels and ticks.
plt.title('Distribution of bike rentals based on trip type\n', fontsize=16, weight='bold')
plt.xlabel('\nTrip type', fontsize=14)
plt.ylabel('Rentals (million)\n', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(tick_values, tick_names, fontsize=12)

# Annotate every bar with its share of all rentals.
locs, labels = plt.xticks()
for loc, label in zip(locs, labels):
    # a tick label without a matching count is annotated as 0
    count = trip_type_counts.get(label.get_text(), 0)
    pct_string = f'{100 * count / n_points:0.0f}%'
    # short bars get the text above the bar, tall bars inside it
    offset = n_points / 20 if count < n_points / 10 else -(n_points / 10)
    plt.text(loc, count + offset, pct_string, ha='center', color='black', fontsize=14)

sb.despine()
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.1.1 Distribution of bike rentals based on trip type.png', dpi=300, bbox_inches='tight')
Observation: The above plot depicts that bike rentals are more for One Way trips compared to Round Trip's.
bike_type column:¶Column: bike_typeData type: categorical data, nominalPlot : Bar chartBar Chart:
# Styling: white background, third colour of the active palette.
sb.set_style('white')
base_color = sb.color_palette()[2]

# Count the rentals per bike type once; bars are drawn most frequent first.
bike_type_counts = bikeshare['bike_type'].value_counts()
bike_type_order = bike_type_counts.index
max_count = bike_type_counts.max()
n_points = bikeshare.shape[0]
tick_values = np.arange(0, max_count + 100000, 100000)
tick_names = [f'{v / 1000000:0.1f} M' for v in tick_values]

sb.countplot(data=bikeshare, x='bike_type', color=base_color, alpha=0.5, order=bike_type_order)

# Titles, axis labels and ticks.
plt.title('Distribution of rentals based on bike type\n', fontsize=16, weight='bold')
plt.xlabel('\nBike type', fontsize=14)
plt.ylabel('Rentals (million)\n', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(tick_values, tick_names, fontsize=12)

# Annotate every bar with its share of all rentals.
locs, labels = plt.xticks()
for loc, label in zip(locs, labels):
    # a tick label without a matching count is annotated as 0
    count = bike_type_counts.get(label.get_text(), 0)
    pct_string = f'{100 * count / n_points:0.0f}%'
    # short bars get the text above the bar, tall bars inside it
    offset = n_points / 40 if count < n_points / 20 else -(n_points / 20)
    plt.text(loc, count + offset, pct_string, ha='center', color='black', fontsize=13)

sb.despine()
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.1.2 Distribution of bike rentals based on bike type.png', dpi=300, bbox_inches='tight')
Observation: The above plot depicts that standard bikes are in more demand compared to electric and smartbikes.
pass_type column:¶Column: pass_typeData type: categorical data, nominalPlot : Bar chart# display individual counts of rentals based on pass type
bikeshare['pass_type'].value_counts()
Bar Chart:
# Styling: white background, seventh colour of the active palette.
sb.set_style("white")
base_color = sb.color_palette()[6]

# Count the rentals per pass type once; bars are drawn most frequent first.
pass_type_counts = bikeshare['pass_type'].value_counts()
pass_type_order = pass_type_counts.index
max_count = pass_type_counts.max()
n_points = bikeshare.shape[0]
tick_values = np.arange(0, max_count + 100000, 100000)
tick_names = [f'{v / 1000000:0.1f} M' for v in tick_values]

sb.countplot(data=bikeshare, x='pass_type', color=base_color, alpha=1,
             order=pass_type_order, saturation=0.5)

# Titles, axis labels and ticks.
plt.title('Distribution of rentals based on customer pass\n', fontsize=16, weight='bold')
plt.xlabel('\nPass type', fontsize=14)
plt.ylabel('Rentals (million)\n', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(tick_values, tick_names, fontsize=12)

# Annotate every bar with its share of all rentals.
locs, labels = plt.xticks()
for loc, label in zip(locs, labels):
    # a tick label without a matching count is annotated as 0
    count = pass_type_counts.get(label.get_text(), 0)
    pct_string = f'{100 * count / n_points:0.0f}%'
    # short bars get the text above the bar, tall bars inside it
    offset = n_points / 30 if count < n_points / 20 else -(n_points / 25)
    plt.text(loc, count + offset, pct_string, ha='center', color='black', fontsize=12)

sb.despine()
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.1.3 Distribution of bike rentals based on customer pass.png', dpi=300, bbox_inches='tight')
Observation: The above plot depicts that Monthly Pass is the most popular subscription among the customers.
fare_type column:¶Column: fare_typeData type: categorical data, nominalPlot : Bar chartBar Chart:
# Styling: white background, ninth colour of the active palette.
sb.set_style("white")
base_color = sb.color_palette()[8]

# Count the rentals per fare type once; bars are drawn most frequent first.
fare_type_counts = bikeshare['fare_type'].value_counts()
fare_type_order = fare_type_counts.index
max_count = fare_type_counts.max()
n_points = bikeshare.shape[0]
tick_values = np.arange(0, max_count + 100000, 100000)
tick_names = [f'{v / 1000000:0.1f} M' for v in tick_values]

sb.countplot(data=bikeshare, x='fare_type', color=base_color, alpha=0.6,
             order=fare_type_order, saturation=1)

# Titles, axis labels and ticks.
plt.title('Distribution of rentals based on fare type\n', fontsize=16, weight='bold')
plt.xlabel('\nFare type', fontsize=14)
plt.ylabel('Rentals (million)\n', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(tick_values, tick_names, fontsize=12)

# Annotate every bar with its share of all rentals.
locs, labels = plt.xticks()
for loc, label in zip(locs, labels):
    # a tick label without a matching count is annotated as 0
    count = fare_type_counts.get(label.get_text(), 0)
    pct_string = f'{100 * count / n_points:0.0f}%'
    # short bars get the text above the bar, tall bars inside it
    offset = n_points / 20 if count < n_points / 10 else -(n_points / 10)
    plt.text(loc, count + offset, pct_string, ha='center', color='black', fontsize=14)

sb.despine()
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.1.4 Distribution of bike rentals based on fare type.png', dpi=300, bbox_inches='tight')
Observation:
Base fare.Extended fares will result in decrease in income generation. As the percentage of Extended fares are less than 20%, some business reforms/promotional programs have to be taken to encourage customers to ride bikes for longer durations.Categorical parameters:¶Columns: trip_type, bike_type, pass_typeData type: categorical data, nominalPlot : Bar chartBar Chart:
def count_subplot(subplot, color, cat_type, alpha, sat):
    """Draw one panel of the 1x4 rentals-by-category figure.

    Parameters
    ----------
    subplot : int
        1-based position of the panel in the 1x4 grid. Only panel 1 shows the
        y-axis label and tick text; the other panels keep the ticks unlabelled.
    color : int
        Index into the active seaborn colour palette.
    cat_type : str
        Name of the categorical ``bikeshare`` column to count. Its first four
        characters, title-cased, are used in the title and the x-axis label.
    alpha : float
        Bar opacity passed to ``sb.countplot``.
    sat : float
        Colour saturation passed to ``sb.countplot``.
    """
    sb.set_style('darkgrid')
    bar_color = sb.color_palette()[color]
    plt.subplot(1, 4, subplot)

    # All panels share one y scale that reaches the total number of rentals.
    total_rentals = bikeshare.shape[0]
    y_positions = np.arange(0, total_rentals + 100000, 100000)
    counts = bikeshare[cat_type].value_counts()

    sb.countplot(data=bikeshare, x=cat_type, color=bar_color, alpha=alpha,
                 order=counts.index, saturation=sat)

    short_name = cat_type[0: 4].title()
    plt.title(f'Rentals based on {short_name} type', fontsize=16, weight='bold')
    plt.xlabel(f'\n{short_name} type', fontsize=14)
    plt.xticks(fontsize=12)
    if subplot == 1:
        y_names = [f'{v / 1000000:0.1f} M' for v in y_positions]
        plt.ylabel('Rentals (million)\n', fontsize=14)
        plt.yticks(y_positions, y_names, fontsize=12)
    else:
        plt.ylabel('')
        plt.yticks(y_positions, [])

    # Annotate every bar with its share of all rentals.
    tick_locs, tick_labels = plt.xticks()
    for tick_loc, tick_label in zip(tick_locs, tick_labels):
        # a tick label without a matching count is annotated as 0
        count = counts.get(tick_label.get_text(), 0)
        share = f'{100 * count / total_rentals:0.0f}%'
        # short bars get the text above the bar, tall bars inside it
        if count < total_rentals / 10:
            text_y = count + (total_rentals / 25)
        else:
            text_y = count - (total_rentals / 15)
        plt.text(tick_loc, text_y, share, ha='center', color='black', fontsize=13)
plt.figure(figsize=[20, 6])
# One panel per categorical column: (cat_type, color, alpha, sat)
panels = [
    ('trip_type', 0, 0.5, 1),
    ('bike_type', 2, 0.5, 1),
    ('pass_type', 6, 0.6, 0.8),
    ('fare_type', 8, 0.6, 1),
]
for position, (cat_type, color, alpha, sat) in enumerate(panels, start=1):
    count_subplot(subplot=position, color=color, cat_type=cat_type, alpha=alpha, sat=sat)
plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.1.5 Comparision of bike rentals based on various categorical parameters.png', dpi=300, bbox_inches='tight')
Observation: The above plot depicts the classification of rentals based on various parameters. It can be concluded that most customers prefer standard bikes over smart bikes, take more One Way trips than Round Trips, and prefer the Monthly Pass over other subscriptions.
Column: hourData type: continuous dataPlot : Distribution plot, Line plotDistribution plot:
# Assign palette and grid as per requirement
sb.set_style('darkgrid')
plt.figure(figsize=[18, 6])
# prepare data for the plot
max_count = bikeshare.hour.value_counts().max()
x_tick_values = np.arange(0, 23 + 1, 1)
x_tick_names = ['{:}'.format(v) for v in x_tick_values]
y_tick_values = np.arange(0, max_count + 10000, 10000)
y_tick_names = ['{:0.0f} K'.format(v / 1000) for v in y_tick_values]
# One unit-wide bin per hour needs the edges 0..24 (24 bins). With the edges
# 0..23 there are only 23 bins, and because the last histogram bin is closed on
# both sides, hours 22 and 23 are counted together in a single bar.
bin_edges = np.arange(0, 24 + 1, 1)
# seaborn's distribution plot
sb.distplot(bikeshare.hour, bins=bin_edges, kde=False, color='lightskyblue',
            hist_kws={'alpha': 1, 'edgecolor': "white", 'linewidth': 1})
# improve plot aesthetics
plt.title('Aggregated distribution of bike rentals based on hour of the day', fontsize=16, weight='bold')
plt.xlabel('\nHour of the day', fontsize=14)
plt.ylabel('Rentals (thousands)\n', fontsize=14)
plt.xticks(x_tick_values, x_tick_names, fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.1.6.a Aggregated distribution of bike rentals based on hour of the day.png', dpi=300, bbox_inches='tight')
Line plot:
plt.figure(figsize=[8, 6])
# Assign palette and grid as per requirement
sb.set_style('darkgrid')
# prepare data for the plot: count the trips per hour once, on the trip_id
# column only, instead of counting every column of the frame twice
hourly_rentals = bikeshare.groupby('hour')['trip_id'].count()
x = hourly_rentals.index
y = hourly_rentals.values
x_tick_values = np.arange(0, 23 + 1, 1)
x_tick_names = ['{:}'.format(v) for v in x_tick_values]
y_tick_values = np.arange(0, bikeshare.hour.value_counts().max() + 10000, 10000)
y_tick_names = ['{:0.0f} K'.format(v / 1000) for v in y_tick_values]
# matplotlib's line plot
plt.plot(x, y, linewidth=2.0, color='lightskyblue')
# improve plot aesthetics
plt.title('Aggregated Hourly distribution of bike rentals', fontsize=16, weight='bold')
plt.xlabel('\nHour of the day', fontsize=14)
plt.ylabel('Rentals (thousands)\n', fontsize=14)
plt.xticks(x_tick_values, x_tick_names, fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
# fill the area under the line
plt.fill_between(x, y, color='lightskyblue')
# draw the vertical axial line at the peak hour
peak_hour = bikeshare['hour'].value_counts(ascending=False).index[0]
plt.axvline(peak_hour, color='black', alpha=0.3, linewidth=2)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.1.6.b Aggregated distribution of bike rentals based on hour of the day.png', dpi=300, bbox_inches='tight')
The above plots depict that the most busy hours are in the evenings and plots a vertical axial line that denotes the hour with maximum aggregated bike rentals over the hour of the day.
Find average rentals based on the hour of the day:
Create a dataset which contains the bike rentals for each hour of the day, over the respective months of every year. Care should be taken to include all hours of every day of the month. Use all categorical combinations and fill the NULL values with numerical zeros, so that every hour of every day is considered, including hours without any rentals.
# Create a dataset of bike rentals for each hour of each day.
# Counting only the observed (day, hour) combinations leaves out every hour
# without a rental, which inflates the hourly averages computed from this
# table. The counts are therefore reindexed over all 24 hours of every day
# that appears in the data, with the missing hours filled with zero.
trip_date = bikeshare['start_time'].dt.normalize().rename('date')
rentals_per_hour = bikeshare.groupby([trip_date, bikeshare['hour']])['trip_id'].count()
all_day_hours = pd.MultiIndex.from_product(
    [rentals_per_hour.index.get_level_values('date').unique(), range(24)],
    names=['date', 'hour'])
rentals_per_hour = rentals_per_hour.reindex(all_day_hours, fill_value=0).astype(int)
hours_df = rentals_per_hour.reset_index(name='rentals')
# Split the date back into the year/month/day columns of the original layout.
for part in ('year', 'month', 'day'):
    hours_df[part] = getattr(hours_df['date'].dt, part)
hours_df = hours_df[['year', 'month', 'day', 'hour', 'rentals']]
hours_df.head(10)
Point plot:
# Figure size and grid style.
plt.figure(figsize=[12, 4])
sb.set_style('whitegrid')

# The point plot aggregates the per-day rentals of each hour into one point.
ax = sb.pointplot(data=hours_df, x="hour", y="rentals", linestyles="-", color='lightskyblue')

# Title and axis labels, set through the axes object.
ax.set_title('Average bike rentals based on hour of the day\n', weight='bold', fontsize=16)
ax.set_ylabel('Avg. rentals\n', fontsize=14)
ax.set_xlabel('\nHour of the day', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# remove all four spines
sb.despine(top=True, right=True, left=True, bottom=True)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.1.6.c Average bike rentals based on hour of the day.png', dpi=300, bbox_inches='tight')
Add annotations to point out the busy hours of the day.
Point plot:
# Assign color palette and figure size as per requirement
plt.figure(figsize=[12, 4])
sb.set_style('white')
# Seaborn's point plot
sb.pointplot(data=hours_df, x="hour", y="rentals", linestyles="-", color='lightskyblue')
# improve plot aesthetics
plt.title('Average bike rentals based on hour of the day\n', weight='bold', fontsize=16)
plt.ylabel('Avg. rentals\n', fontsize=14)
plt.xlabel('\nHour of the day', fontsize=14)
plt.xticks(fontsize=12)
# get the y-tick locations and rebuild them in steps of 10, starting at zero
locs, labels = plt.yticks()
max_count = locs.max()
y_tick_values = np.arange(0, max_count + 10, 10)
y_tick_names = ['{:0.0f}'.format(v) for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
# add annotations
# -------------------------------------------------------
# average rentals per hour, sorted by hour like the categorical x axis
avg_rental_counts = hours_df.groupby('hour')['rentals'].mean()
avg_rental_max = avg_rental_counts.max()
# hours above 3/5 of the busiest hour are highlighted in gold
busy_cutoff = (avg_rental_max * 3) / 5
clrs = ['gold' if count > busy_cutoff else 'limegreen' for count in avg_rental_counts]
# get the current tick locations and labels
locs, labels = plt.xticks()
# every annotation sits the same distance above its point
text_offset = int(avg_rental_max / 10)
# The former try/except KeyError around a plain assignment could never trigger
# and has been removed.
for loc, count, clr in zip(locs, avg_rental_counts, clrs):
    plt.text(loc, count + text_offset, '{:0.0f}'.format(count), ha='center', color='black',
             fontsize=12, bbox=dict(pad=1.9, alpha=0.2, color='none', fc=clr))
# -------------------------------------------------------
sb.despine(top=True, right=True, left=False, bottom=False)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.1.6.d Average bike rentals based on hour of the day.png', dpi=300, bbox_inches='tight')
Observations:
6:00 AM until 5:00 PM, with peaks at 8:00 AM, 12:00 PM, and 5:00 PM, which are the morning office hours, the afternoon lunch time, and the evening office-leaving time respectively. This suggests that a large portion of the customer base consists of working individuals who use the bikes for transportation.
Column: daytime
Data type: continuous data
Plot: Count plot:
# Count plot of rentals per daytime bucket. Each bar is annotated with its
# share of all rentals, and a legend lists the hour range of every bucket.
plt.figure(figsize=[6, 4])
sb.set(style="white")

# y ticks every 100 000 rentals, labelled in millions
y_tick_values = np.arange(
    0,
    bikeshare['daytime'].value_counts().max() + 100000,
    100000,
)
y_tick_names = ['{:0.1f} M'.format(v / 1000000) for v in y_tick_values]

# Seaborn's count plot
sb.countplot(x='daytime', data=bikeshare, color='lightskyblue')

# title, axis labels and ticks
plt.title('Aggregated distribution of daytime bike rentals\n', fontsize=16, weight='bold')
plt.xlabel('\nDaytime', fontsize=14)
plt.ylabel('Rentals (million)\n', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

# annotations: percentage of all rentals per bar
# -------------------------------------------------------
n_points = bikeshare.shape[0]
daytime_counts = bikeshare['daytime'].value_counts()
locs, labels = plt.xticks()
for loc, label in zip(locs, labels):
    # the tick label text is the daytime category the bar belongs to
    try:
        count = daytime_counts[label.get_text()]
    except KeyError:
        count = 0
    pct_string = '{:0.0f}%'.format(100 * count / n_points)
    # short bars carry the text above the bar, tall bars carry it inside
    if count < (n_points / 10):
        text_height = count + (n_points / 50)
    else:
        text_height = count - (n_points / 30)
    plt.text(loc, text_height, pct_string, ha='center', color='black', fontsize=12)
# -------------------------------------------------------

# legend: one invisible scatter series per daytime supplies the legend rows
# -------------------------------------------------------
daytime_order = ['Early hours', 'Morning', 'Afternoon', 'Evening', 'Night']
duration_order = ['[0, 5] - 6 hours', '(5, 11] - 6 hours', '(11, 16] - 5 hours', '(16, 20] - 4 hours', '(20, 23] - 3 hours']
# per-row padding width used to line up the ' : ' separators
indents = [16, 17, 16, 17, 19]
for time, duration, indent in zip(daytime_order, duration_order, indents):
    row_label = time.ljust(indent) + ' : ' + duration
    plt.scatter([], [], c='k', alpha=0.3, label=row_label)
legend_options = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
    framealpha=1, borderpad=1, borderaxespad=1, bbox_to_anchor=(1, 0.7),
    loc=6, labelspacing=0.5, title='Daytime : duration', title_fontsize=14,
    fontsize=12, facecolor='white', markerfirst=True, handlelength=0.5,
    handletextpad=0.5,
)
plt.legend(**legend_options)
# -------------------------------------------------------
sb.despine()
# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig(
    'plots/3.1.7.a Aggregated distribution of bike rentals based on time of the day.png',
    dpi=300,
    bbox_inches='tight',
)
Find average rentals based on the daytime:
Create a dataset which contains bike rentals relative to the time of the day over the respective months in any year. Care should be taken to include all daytimes in every day of the month. Use all categorical combinations and fill the NULL values with numerical zeros so as to consider bike rentals subjected to each daytime in any day.
# Rentals per (year, month, day, daytime) combination
group_keys = [bikeshare[col] for col in ('year', 'month', 'day', 'daytime')]
# count the trips in each group and expose the count as a 'rentals' column
trips_per_group = bikeshare.groupby(group_keys).count()['trip_id']
daytime_df = trips_per_group.reset_index(name='rentals')
# any missing count becomes zero and the column is stored as integers
daytime_df['rentals'] = daytime_df['rentals'].fillna(0).astype(int)
daytime_df.head(10)
Point plot:
# Point plot of the rentals per daytime bucket, with every point annotated by
# the average value for that bucket; the busiest buckets get a gold label box.
sb.set_style('white')
# Seaborn's point plot
sb.pointplot(data=daytime_df, x="daytime", y="rentals", linestyles="-", color='lightskyblue')
# improve plot aesthetics
plt.title('Avg. bike rentals based on daytime\n', weight='bold', fontsize=16)
plt.ylabel('Avg. rentals\n', fontsize=14)
plt.xlabel('\nDaytime', fontsize=14)
plt.xticks(fontsize=12)
# read the automatic y ticks, then rebuild them from zero in steps of 50
locs, labels = plt.yticks()
max_count = locs.max()
y_tick_values = np.arange(0, max_count + 50, 50)
y_tick_names = ['{:0.0f}'.format(v) for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
# add annotations
# -------------------------------------------------------
cat_order = daytime_df.daytime.sort_values(ascending=True).unique()
# Select 'rentals' before averaging so only that column is aggregated; averaging
# the whole frame first fails on non-numeric columns in recent pandas versions.
avg_rental_counts = daytime_df.groupby('daytime')['rentals'].mean()[cat_order]
avg_rental_max = avg_rental_counts.max()
# buckets whose average exceeds 4/5 of the busiest bucket's average are shown in gold
clrs = ['gold' if (count > ((avg_rental_max * 4) / 5)) else 'limegreen' for count in avg_rental_counts]
# get the current tick locations and labels
locs, labels = plt.xticks()
# constant vertical gap between a point and its label
text_offset = int(avg_rental_max / 10)
# loop through each tick together with the matching average and colour
for loc, label, avg_rental_count, clr in zip(locs, labels, avg_rental_counts, clrs):
    count = avg_rental_count
    pct_string = '{:0.0f}'.format(count)
    # print the annotation slightly above the point
    plt.text(loc, count + text_offset, pct_string, ha='center', color='black',
             fontsize=12, bbox=dict(pad=1.9, alpha=0.2, color='none', fc=clr))
# -------------------------------------------------------
sb.despine(top=True, right=True, left=False, bottom=False);
# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.1.7.b Average bike rentals based on time of the day.png', dpi=300, bbox_inches='tight')
Observations:
Afternoon, with Morning and Evening being closest. This denotes that the customers use bike rentals to avoid bright sun.
Early Hours and Night times. Promoting fitness activities will increase rental activity during Early Hours, while tie-ups with night events will boost Night rentals.
Column: day
Data type: continuous data
Plot: Line plot:
# Line plot of the total number of trips recorded on each day of the month
plt.figure(figsize=[8, 6])
sb.set_style('darkgrid')
sb.set_palette(palette='colorblind', n_colors=10, desat=None)
clr = sb.color_palette()[4]

# one grouped count feeds both axes: index -> day values, values -> trip counts
trips_per_day = bikeshare.groupby(bikeshare['day']).count()['trip_id']
x = trips_per_day.index
y = trips_per_day.values

# x ticks every 5 days starting from day 1
days_descending = bikeshare.day.sort_values(ascending=False).unique()
day_index_max = days_descending[0]
x_tick_values = np.arange(1, day_index_max + 1, 5)
x_tick_names = ['{:}'.format(v) for v in x_tick_values]

# y ticks every 10 000 rentals, labelled in thousands
busiest_day_count = bikeshare.day.value_counts().max()
y_tick_values = np.arange(0, busiest_day_count + 10000, 10000)
y_tick_names = ['{:0.0f} K'.format(v / 1000) for v in y_tick_values]

# matplotlib's line plot
plt.plot(x, y, linewidth=2.0, color=clr)

# title, axis labels and ticks
plt.title('Aggregative distribution of daily bike rentals\n', fontsize=16, weight='bold')
plt.xlabel('\nDay of the month', fontsize=14)
plt.ylabel('Rentals (thousands)\n', fontsize=14)
plt.xticks(x_tick_values, x_tick_names, fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig(
    'plots/3.1.8.a Aggregated distribution of daily bike rentals.png',
    dpi=300,
    bbox_inches='tight',
)
Bar chart:
# Histogram (one bin per day of the month) of all recorded rentals
plt.figure(figsize=[18, 6])
sb.set_style('darkgrid')
sb.set_palette(palette='colorblind', n_colors=10, desat=None)
clr = sb.color_palette()[4]

# tick positions and labels
max_count = bikeshare.day.value_counts().max()
days_descending = bikeshare.day.sort_values(ascending=False).unique()
day_index_max = days_descending[0]
x_tick_values = np.arange(1, day_index_max + 1, 1)
x_tick_names = ['{:}'.format(v) for v in x_tick_values]
y_tick_values = np.arange(0, max_count + 10000, 10000)
y_tick_names = ['{:0.0f} K'.format(v / 1000) for v in y_tick_values]
# bin edges sit on the half-integers so every day 1..31 is centred in its own bin
bin_edges = np.arange(0.5, 31.5 + 1, 1)

# Seaborn's distribution plot, histogram only (no density curve)
bar_style = {'alpha': 0.8, 'edgecolor': "white", 'linewidth': 1}
sb.distplot(bikeshare.day, bins=bin_edges, kde=False, color=clr, hist_kws=bar_style)

# title, axis labels and ticks
plt.title('Aggregative distribution of bike rentals based on day of the month', fontsize=16, weight='bold')
plt.xlabel('\nDay of the month', fontsize=14)
plt.ylabel('Rentals (thousands)\n', fontsize=14)
plt.xticks(x_tick_values, x_tick_names, fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig(
    'plots/3.1.8.b Aggregated distribution of bike rentals based on day of the month.png',
    dpi=300,
    bbox_inches='tight',
)
The above plots depict that the 31st has fewer aggregate bike rentals than the other days of the month. Plot a bar chart with annotations to portray the difference in bike rentals on the 31st compared to the other days of the month.
Bar chart:
# Bar chart of total rentals per day of the month. The day(s) with the fewest
# rentals keep the palette colour, all other bars are drawn in 'thistle', and
# every bar is annotated with its share of all rentals.
plt.figure(figsize=[18, 6])
sb.set_style('darkgrid')
sb.set_palette(palette='colorblind', n_colors=10, desat=None)
clr = sb.color_palette()[4]
# prepare data for the plot
day_index_max = bikeshare.day.max()
daily_order = np.arange(1, day_index_max + 1, 1)
observed_counts = bikeshare.day.value_counts()
max_count = observed_counts.max()
min_count = observed_counts.min()
tick_values = np.arange(0, max_count + 10000, 10000)
tick_names = ['{:0.0f} K'.format(v / 1000) for v in tick_values]
# value_counts() is ordered by frequency, not by day, so align the counts with
# the plotting order before deriving per-bar colours and labels from them;
# days that never occur get a count of zero.
daily_counts = observed_counts.reindex(daily_order, fill_value=0)
day_values = daily_counts.values
clrs = ['thistle' if (x > min_count) else clr for x in day_values]
# Seaborn's count plot
sb.countplot(data=bikeshare, x='day', palette=clrs,
             alpha=1, order=daily_order, saturation=0.8)
# improve plot aesthetics
plt.title('Aggregative distribution of bike rentals based on day of the month', fontsize=16, weight='bold')
plt.xlabel('\nDay of the month', fontsize=14)
plt.ylabel('Rentals (thousands)\n', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(tick_values, tick_names, fontsize=12)
# add annotations
# -------------------------------------------------------
n_points = bikeshare.shape[0]
daily_max = daily_counts.max()
# get the current tick locations and labels
locs, labels = plt.xticks()
# constant vertical gap between the top of a bar and its label
text_offset = daily_max / 40
# bars are drawn in daily_order, so tick i belongs to daily_counts[i]
for loc, label, count in zip(locs, labels, daily_counts):
    pct_string = '{:0.1f}%'.format(100 * count / n_points)
    # print the annotation just above the bar
    plt.text(loc, count + text_offset, pct_string, ha='center', color='black', fontsize=12)
# -------------------------------------------------------
# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.1.8.c Aggregated distribution of bike rentals based on day of the month.png', dpi=300, bbox_inches='tight')
The above plots depict that the rentals decrease towards the end of the month, especially on the 31st. The bike rentals are categorized by day of the month, and the distribution is the cumulative sum for each day over 3 years rather than for each individual month. Hence, there are only 21 occurrences of the 31st, while the other days occur 36 times over the 3-year period (2017-2019), except for days 29 and 30, which occur 33 times because they are absent in February. This denotes that the rate of rentals is actually high on the 31st compared to other days.
Find average rentals based on the day of the month:
Create a dataset which contains bike rentals relative to each day of the month over the respective years. Care should be taken not to include the 31st in every month of the year. Use only the unique appearances of categorical combinations and do not fill the NULL values with numerical zeros, so that bike rentals on the 31st are considered only for the months that have one.
# Rentals per (year, month, day) combination that actually appears in the data
group_keys = [bikeshare[col] for col in ("year", "month", "day")]
# size() counts the rows of each group; the count is exposed as 'rentals'
group_sizes = bikeshare.groupby(group_keys).size()
days_df = group_sizes.reset_index(name='rentals')
days_df.tail(10)
Check the appearances of individual days over the dataset created.
# Count how many rows of days_df carry each day of the month, listed in
# ascending day order.
cat_order = days_df.day.sort_values(ascending=True).unique()
# the listed values are days of the month, so label the table accordingly
print('Day - Occurrences')
days_df.day.value_counts()[cat_order]
The above cell depicts that the days 29, 30, and 31 has relatively less appearences compared to the other days in the month. This confirms the reliability of the dataset to calculate the average bike rentals based on day of the month.
Point plot:
# Point plot of rentals per day of the month (seaborn's default error bars kept)
plt.figure(figsize=[12, 4])
sb.set_style('darkgrid')

# Seaborn's point plot
sb.pointplot(
    data=days_df,
    x="day",
    y="rentals",
    linestyles="-",
    color='lightskyblue',
)

# title and axis labels
plt.title('Avg. bike rentals based on day of the month\n', weight='bold', fontsize=16)
plt.ylabel('Avg. bike rentals\n', fontsize=14)
plt.xlabel('\nDay of the month', fontsize=14)
sb.despine()

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig(
    'plots/3.1.8.d Average bike rentals based on day of the month.png',
    dpi=300,
    bbox_inches='tight',
)
The above plot contains error distribution lines over standard deviation and is interferring with the interpretability of the plot. Hence remove the error distribution lines for clear plot aesthetics.
Point plot:
# Same point plot as before, drawn without error bars (ci=None)
plt.figure(figsize=[12, 4])
sb.set_style('whitegrid')

# Seaborn's point plot
sb.pointplot(
    data=days_df,
    x="day",
    y="rentals",
    linestyles="-",
    color='lightskyblue',
    ci=None,
)

# title and axis labels
plt.title('Avg. bike rentals based on day of the month\n', weight='bold', fontsize=16)
plt.ylabel('Avg. rentals\n', fontsize=14)
plt.xlabel('\nDay of the month', fontsize=14)

# keep only the bottom spine
sb.despine(top=True, right=True, left=True, bottom=False)

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig(
    'plots/3.1.8.e Average bike rentals based on day of the month.png',
    dpi=300,
    bbox_inches='tight',
)
Contrary to the previous plots, the above plot depicts that the days at the end of the month have relatively high average bike rentals compared to most of the other days in the month. However, the y-axis of the above plot does not start at zero, which amplifies the difference between the average rentals for any given day in the month. Re-plot the above graph with the y-axis starting at zero.
Point plot:
# Point plot of rentals per day of the month with the y ticks rebuilt from
# zero and two dashed reference lines at 700 and 800 rentals.
plt.figure(figsize=[12, 4])
sb.set_style('white')

# Seaborn's point plot, without error bars
sb.pointplot(
    data=days_df,
    x="day",
    y="rentals",
    linestyles="-",
    color='lightskyblue',
    ci=None,
)

# title, axis labels and x tick size
plt.title('Avg. bike rentals based on day of the month\n', weight='bold', fontsize=16)
plt.ylabel('Avg. rentals\n', fontsize=14)
plt.xlabel('\nDay of the month', fontsize=14)
plt.xticks(fontsize=12)

# take the largest automatic y tick and re-tick from zero in steps of 100
locs, labels = plt.yticks()
max_count = locs.max()
y_tick_values = np.arange(0, max_count + 100, 100)
y_tick_names = ['{:0.0f}'.format(v) for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

# draw the horizontal reference lines
for reference_level in (700, 800):
    plt.axhline(reference_level, color='black', alpha=1, linewidth=0.5, linestyle='--')

# keep only the bottom spine
sb.despine(top=True, right=True, left=True, bottom=False)

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig(
    'plots/3.1.8.f Average bike rentals based on day of the month.png',
    dpi=300,
    bbox_inches='tight',
)
The average bike rentals per day of the month range only between 700 and 800. This depicts that there is no significant difference in average bike rentals between any two days of a month.
Observations:
700 and 800 only. This depicts that there is no significant difference in average bike rentals between any two days of a month.
Column: weekday
Data type: continuous data
Plot: Bar chart:
# Bar chart of total rentals per day of the week; every bar is annotated with
# its share of all rentals.
plt.figure(figsize=[8, 6])
sb.set_style('white')
# prepare data for the plot
day_order = bikeshare.weekday.sort_values(ascending=True).unique()
max_count = bikeshare.weekday.value_counts().max()
y_tick_values = np.arange(0, max_count + 25000, 25000)
y_tick_names = ['{:0.0f} K'.format(v / 1000) for v in y_tick_values]
# Seaborn's count plot
sb.countplot(data=bikeshare, x='weekday', color='cyan',
             alpha=0.5, order=day_order, saturation=0.5)
# improve plot aesthetics
plt.title('Aggregated distribution of bike rentals over the weekday\n', fontsize=16, weight='bold')
plt.xlabel('\nDay of the week', fontsize=14)
plt.ylabel('Rentals (thousands)\n', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
# add annotations
# -------------------------------------------------------
n_points = bikeshare.shape[0]
# value_counts() is ordered by frequency and keyed by weekday value, so a tick
# position cannot be used to look a count up; align the counts with the bar
# order instead.
day_counts = bikeshare.weekday.value_counts().reindex(day_order, fill_value=0)
day_max = day_counts.max()
# get the current tick locations and labels
locs, labels = plt.xticks()
# bars are drawn in day_order, so tick i belongs to day_counts[i]
for loc, label, count in zip(locs, labels, day_counts):
    pct_string = '{:0.1f}%'.format(100 * count / n_points)
    # short bars carry the text above the bar, tall bars carry it inside
    if count < (day_max / 10):
        plt.text(loc, count + (day_max / 25), pct_string, ha='center', color='black', fontsize=12)
    else:
        plt.text(loc, count - (day_max / 15), pct_string, ha='center', color='black', fontsize=12)
# -------------------------------------------------------
sb.despine();
# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.1.9.a Aggregated distribution of bike rentals over the week.png', dpi=300, bbox_inches='tight')
The above plots depicts that the day of the week does not have much impact on the rentals.
Find average rentals based on the weekday:
Create a dataset which contains bike rentals relative to the day of the week over the respective months in any year. Care should be taken to include all days in every week of the month. Use all categorical combinations and fill the NULL values with numerical zeros so as to consider bike rentals subjected to each day in any week.
# create a dataset for bike rentals over the days in a week
weekday_df = bikeshare.groupby([bikeshare['year'],
bikeshare['month'],
bikeshare['week'],
bikeshare['weekday']]).count()['trip_id'].reset_index(name='rentals')
weekday_df['rentals'] = weekday_df['rentals'].fillna(0).astype(int)
weekday_df.head(10)
Point plot:
# Point plot of rentals per day of the week, drawn without error bars
plt.figure(figsize=[8, 4])
sb.set_style('whitegrid')
# single-colour, desaturated palette
flatui = ['cyan']
sb.set_palette(flatui, n_colors=1, desat=0.5)

# Seaborn's point plot, coloured with the first palette entry
sb.pointplot(
    data=weekday_df,
    x="weekday",
    y="rentals",
    linestyles="-",
    color=sb.color_palette()[0],
    ci=None,
    alpha=0.5,
)

# title, axis labels and tick sizes
plt.title('Avg. bike rentals based on day of the week\n', weight='bold', fontsize=16)
plt.ylabel('Avg. bike rentals\n', fontsize=14)
plt.xlabel('\nDay of the week', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# remove all four axis spines
sb.despine(top=True, right=True, left=True, bottom=True)

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig(
    'plots/3.1.9.b Average bike rentals based on day of the week.png',
    dpi=300,
    bbox_inches='tight',
)
The above plot depicts that Saturdays and Sundays have relatively low average bike rentals compared to the other days of the week. This suggests that part of the customer base consists of working employees who use bikes to ride to work. However, the y-axis of the above plot does not start at zero, which amplifies the difference between the average rentals for any given day of the week. Re-plot the above graph with the y-axis starting at zero.
Point plot:
# Point plot of the rentals per weekday with the y ticks rebuilt from zero.
# Every point is annotated with the average for that weekday (gold box for the
# busiest days) and two dashed reference lines mark 600 and 700 rentals.
plt.figure(figsize=[8, 4])
sb.set_style('white')
flatui = ['cyan']
sb.set_palette(flatui, n_colors=1, desat=0.5)
base_color = sb.color_palette()[0]
# Seaborn's point plot
sb.pointplot(data=weekday_df, x="weekday", y="rentals", linestyles="-", color=base_color)
# improve plot aesthetics
plt.title('Avg. bike rentals based on weekday\n', weight='bold', fontsize=16)
plt.ylabel('Avg. rentals\n', fontsize=14)
plt.xlabel('\nWeekday', fontsize=14)
plt.xticks(fontsize=12)
# read the automatic y ticks, then rebuild them from zero in steps of 100
locs, labels = plt.yticks()
max_count = locs.max()
y_tick_values = np.arange(0, max_count + 100, 100)
y_tick_names = ['{:0.0f}'.format(v) for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
# add annotations
# -------------------------------------------------------
cat_order = weekday_df.weekday.sort_values(ascending=True).unique()
# Select 'rentals' before averaging so only that column is aggregated; averaging
# the whole frame first fails on non-numeric columns in recent pandas versions.
avg_rental_counts = weekday_df.groupby('weekday')['rentals'].mean()[cat_order]
avg_rental_max = avg_rental_counts.max()
# weekdays whose average exceeds 9/10 of the busiest weekday's average are shown in gold
clrs = ['gold' if (count > ((avg_rental_max * 9) / 10)) else 'limegreen' for count in avg_rental_counts]
# get the current tick locations and labels
locs, labels = plt.xticks()
# constant vertical gap between a point and its label
text_offset = int(avg_rental_max / 10)
# loop through each tick together with the matching average and colour
for loc, label, avg_rental_count, clr in zip(locs, labels, avg_rental_counts, clrs):
    count = avg_rental_count
    pct_string = '{:0.0f}'.format(count)
    # print the annotation slightly above the point
    plt.text(loc, count + text_offset, pct_string, ha='center', color='black',
             fontsize=12, bbox=dict(pad=1.9, alpha=0.2, color='none', fc=clr))
# -------------------------------------------------------
# draw the horizontal reference lines
plt.axhline(600, color='grey', alpha=1, linewidth=0.5, linestyle='--')
plt.axhline(700, color='grey', alpha=1, linewidth=0.5, linestyle='--')
sb.despine(top=True, right=True, left=True, bottom=False);
# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.1.9.c Average bike rentals based on day of the week.png', dpi=300, bbox_inches='tight')
The above plot depicts the distribution of average bike rentals over the day of the week, which mostly ranges between 600 and 700. The yellow annotations represent the busy days of the week. This depicts that there is a slight decrease in average bike rentals on the weekend (Saturday, Sunday), while Friday appears to be the busiest day of the week.
Observations:
600 and 700. The yellow annotations represent the busy days of the week. This depicts that there is a slight decrease in average bike rentals on the weekend (Saturday, Sunday), while Friday appears to be the busiest day of the week.
10K bike rides will potentially increase the bike rentals on the weekends significantly.
Column: week
Data type: continuous data
Plot: Count plot:
# Bar chart of total rentals per week of the month; every bar is annotated
# with its share of all rentals. (The day_* names hold week-level values here.)
plt.figure(figsize=[8, 6])
sb.set_style('white')
# prepare data for the plot
day_order = bikeshare.week.sort_values(ascending=True).unique()
max_count = bikeshare.week.value_counts().max()
y_tick_values = np.arange(0, max_count + 25000, 25000)
y_tick_names = ['{:0.0f} K'.format(v / 1000) for v in y_tick_values]
# Seaborn's count plot
sb.countplot(data=bikeshare, x='week', color='cyan',
             alpha=0.5, order=day_order, saturation=0.5)
# improve plot aesthetics
plt.title('Aggregated distribution of bike rentals over the week of month\n', fontsize=16, weight='bold')
plt.xlabel('\nWeek of the month', fontsize=14)
plt.ylabel('Rentals (thousands)\n', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
# add annotations
# -------------------------------------------------------
n_points = bikeshare.shape[0]
# value_counts() is ordered by frequency and keyed by week value, so a tick
# position cannot be used to look a count up; align the counts with the bar
# order instead.
day_counts = bikeshare.week.value_counts().reindex(day_order, fill_value=0)
day_max = day_counts.max()
# get the current tick locations and labels
locs, labels = plt.xticks()
# bars are drawn in day_order, so tick i belongs to day_counts[i]
for loc, label, count in zip(locs, labels, day_counts):
    pct_string = '{:0.1f}%'.format(100 * count / n_points)
    # short bars carry the text above the bar, tall bars carry it inside
    if count < (day_max / 10):
        plt.text(loc, count + (day_max / 25), pct_string, ha='center', color='black', fontsize=13)
    else:
        plt.text(loc, count - (day_max / 15), pct_string, ha='center', color='black', fontsize=13)
# -------------------------------------------------------
sb.despine();
# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.1.10.a Aggregated distribution of bike rentals over the week of the month.png', dpi=300, bbox_inches='tight')
The fifth week has low cumulative bike rentals because it has fewer days. The fifth week has either 0 (February) or 2 or 3 days, while the rest of the weeks have a constant 7 days. Hence, ignoring the fifth week, the other weeks have similar distributions, and the week of the month does not have much impact on the bike rentals.
Find average rentals based on the week of the month:
Create a dataset which contains bike rentals relative to the week of the month over the respective years. Care should be taken to include every week in the month. Use all categorical combinations and fill the NULL values with numerical zeros so as to consider bike rentals subjected to each week in any month.
# Rentals per (year, month, week) combination
group_keys = [bikeshare[col] for col in ('year', 'month', 'week')]
# count the trips in each group and expose the count as a 'rentals' column
trips_per_group = bikeshare.groupby(group_keys).count()['trip_id']
week_df = trips_per_group.reset_index(name='rentals')
# any missing count becomes zero and the column is stored as integers
week_df['rentals'] = week_df['rentals'].fillna(0).astype(int)
week_df.head(10)
Point plot:
# Point plot of the rentals per week of the month with the y ticks rebuilt
# from zero; every point is annotated with the average for that week (gold box
# for the busiest weeks).
plt.figure(figsize=[8, 4])
sb.set_style('whitegrid')
flatui = ['cyan']
sb.set_palette(flatui, n_colors=1, desat=0.5)
# Seaborn's point plot
sb.pointplot(data=week_df, x="week", y="rentals", linestyles="-", color=sb.color_palette()[0])
# improve plot aesthetics
plt.title('Avg. bike rentals based on week of the month\n\n', weight='bold', fontsize=16)
plt.ylabel('Avg. bike rentals\n', fontsize=14)
plt.xlabel('\nWeek of the month', fontsize=14)
plt.xticks(fontsize=12)
# read the automatic y ticks, then rebuild them from zero in steps of 1000
locs, labels = plt.yticks()
max_count = locs.max()
y_tick_values = np.arange(0, max_count + 1000, 1000)
y_tick_names = ['{:0.0f}'.format(v) for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
# add annotations
# -------------------------------------------------------
cat_order = week_df.week.sort_values(ascending=True).unique()
# Select 'rentals' before averaging so only that column is aggregated; averaging
# the whole frame first fails on non-numeric columns in recent pandas versions.
avg_rental_counts = week_df.groupby('week')['rentals'].mean()[cat_order]
avg_rental_max = avg_rental_counts.max()
# weeks whose average exceeds 4/5 of the busiest week's average are shown in gold
clrs = ['gold' if (count > ((avg_rental_max * 4) / 5)) else 'limegreen' for count in avg_rental_counts]
# get the current tick locations and labels
locs, labels = plt.xticks()
# constant vertical gap between a point and its label
text_offset = int(avg_rental_max / 10)
# loop through each tick together with the matching average and colour
for loc, label, avg_rental_count, clr in zip(locs, labels, avg_rental_counts, clrs):
    count = avg_rental_count
    pct_string = '{:0.0f}'.format(count)
    # print the annotation slightly above the point
    plt.text(loc, count + text_offset, pct_string, ha='center', color='black',
             fontsize=12, bbox=dict(pad=1.9, alpha=0.2, color='none', fc=clr))
# -------------------------------------------------------
sb.despine(top=True, right=True, left=True, bottom=False);
# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.1.10.b Average bike rentals over the week of the month.png', dpi=300, bbox_inches='tight')
The fifth week has relatively low bike rentals because it has fewer days. The fifth week has either 0 (February) or 2 or 3 days, while the rest of the weeks have a constant 7 days.
Observations:
Column: monthData type: continuous dataPlot :Line plot:
# Filled line plot of the total number of trips recorded in each month
sb.set_style('darkgrid')
plt.figure(figsize=[8, 6])

# one grouped count feeds both axes: index -> month values, values -> trip counts
trips_per_month = bikeshare.groupby(bikeshare['month']).count()['trip_id']
x = trips_per_month.index
y = trips_per_month.values

# x ticks for every month starting from 1; y ticks every 10 000 rentals,
# labelled in thousands
max_count = bikeshare.month.value_counts().max()
months_descending = bikeshare.month.sort_values(ascending=False).unique()
month_index_max = months_descending[0]
x_tick_values = np.arange(1, month_index_max + 1, 1)
x_tick_names = ['{:}'.format(v) for v in x_tick_values]
y_tick_values = np.arange(0, max_count + 10000, 10000)
y_tick_names = ['{:0.0f}'.format(v / 1000) for v in y_tick_values]

# Matplotlib's line plot
plt.plot(x, y, linewidth=2.0, color='slateblue', alpha=0.8)

# title, axis labels and ticks
plt.title('Aggregated distribution of monthly entries\n', fontsize=16, weight='bold')
plt.xlabel('\nMonth', fontsize=14)
plt.ylabel('Rentals (thousands)\n', fontsize=14)
plt.xticks(x_tick_values, x_tick_names, fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
# shade the area under the line
plt.fill_between(x, y, color='slateblue', alpha=0.8)

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig(
    'plots/3.1.11.a Aggregated distribution of monthly rentals.png',
    dpi=300,
    bbox_inches='tight',
)
Distribution plot:
# Histogram (one bin per month) of all recorded rentals
sb.set_style('darkgrid')
plt.figure(figsize=[12, 6])

# tick positions and labels
max_count = bikeshare.month.value_counts().max()
months_descending = bikeshare.month.sort_values(ascending=False).unique()
month_index_max = months_descending[0]
x_tick_values = np.arange(1, month_index_max + 1, 1)
x_tick_names = ['{:}'.format(v) for v in x_tick_values]
y_tick_values = np.arange(0, max_count + 10000, 10000)
y_tick_names = ['{:0.0f}'.format(v / 1000) for v in y_tick_values]
# bin edges sit on the half-integers so every month 1..12 is centred in its own bin
bin_edges = np.arange(0.5, 12.5 + 1, 1)

# Seaborn's distribution plot, histogram only (no density curve)
bar_style = {'alpha': 0.8, 'edgecolor': "white", 'linewidth': 1}
sb.distplot(bikeshare.month, bins=bin_edges, kde=False, color='slateblue', hist_kws=bar_style)

# title, axis labels and ticks
plt.title('Aggregated distribution of bike rentals based on the month\n', fontsize=16, weight='bold')
plt.xlabel('\nMonth of the year', fontsize=14)
plt.ylabel('Bike Rentals (thousands)\n', fontsize=14)
plt.xticks(x_tick_values, x_tick_names, fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig(
    'plots/3.1.11.b Aggregated distribution of monthly rentals.png',
    dpi=300,
    bbox_inches='tight',
)
The above plot depicts that the month August has the most aggregated bike rentals while the month February has the least aggregated bike rentals. Plot a bar chart with annotations for a more clear interpretation.
count plot:
# Bar chart of trips per calendar month (all years pooled); every bar is
# annotated with its percentage of all trips.
# Assign figure size and grid as per requirement
sb.set_style('white')
plt.figure(figsize = [12, 6])
# prepare data for the plot
# count the trips per month once and reuse the result for ticks and annotations
monthly_counts = bikeshare.month.value_counts()
monthly_max = monthly_counts.max()
max_count = monthly_max
n_points = bikeshare.shape[0]
# bars are drawn for every month number from 1 up to the highest one present
month_index_max = bikeshare.month.max()
monthly_order = np.arange(1, month_index_max+1, 1)
tick_values = np.arange(0, max_count+10000, 10000)
tick_names = ['{:0.0f} K'.format(v/1000) for v in tick_values]
# Seaborn's count plot
sb.countplot(data = bikeshare, x = 'month', color = 'slateblue',
             alpha= 0.8, order = monthly_order, saturation = 1)
# improve plot aesthetics
plt.title('Aggregated distribution of bike rentals over the month of the year\n', fontsize = 16, weight = 'bold')
plt.xlabel('\nMonth', fontsize = 14)
plt.ylabel('Rentals (thousands)\n', fontsize = 14)
plt.xticks(fontsize = 12)
plt.yticks(tick_values, tick_names, fontsize = 12)
# add annotations
# -------------------------------------------------------
# get the current tick locations and labels
locs, labels = plt.xticks()
# loop through each pair of locations and labels
for loc, label in zip(locs, labels):
    # the tick text is the month number; use it to look up that month's count
    try:
        count = monthly_counts[int(label.get_text())]
    except KeyError:
        # a month in `monthly_order` without any trips: label it 0% at a fixed height
        count = 10000
        pct_string = '0%'
    else:
        pct_string = '{:0.1f}%'.format(100*count/n_points)
    # short bars get the text above the bar top, all others just below it
    if count < (monthly_max/10):
        text_y = count+(monthly_max/25)
    else:
        text_y = count-(monthly_max/15)
    plt.text(loc, text_y, pct_string, ha = 'center', color = 'white', weight = 'bold', fontsize = 14)
# -------------------------------------------------------
sb.despine();
# bbox_inches='tight' makes the saved image include all of the x and y labels
plt.savefig('plots/3.1.11.c Aggregated distribution of bike rentals over the month of the year.png', dpi=300, bbox_inches='tight')
The above plot shows that August, September, and October have the most aggregated bike rentals, while January and February have the fewest.
Find average rentals based on month of the year:
Create a dataset that contains the bike rentals for each month of the year. Care should be taken to include every month in the respective year. Use all categorical combinations and fill the NULL values with numerical zeros, so that bike rentals are considered for each month in every year.
# create a dataset for bike rentals over the month of the year:
# one row per (year, month) group with the number of trips in it.
# trip_id is selected before counting so only that column is counted,
# instead of counting every column and discarding all but one.
month_df = bikeshare.groupby([bikeshare['year'],
                              bikeshare['month']])['trip_id'].count().reset_index(name='rentals')
# kept from the original as a safeguard; guarantees an integer column
month_df['rentals'] = month_df['rentals'].fillna(0).astype(int)
month_df.head(10)
Point plot:
# Point plot of `rentals` against `month` from `month_df` (built earlier in
# this notebook); seaborn aggregates the rows that share a month.
# Assign grid and figure size as per requirement
plt.figure(figsize=[8,4])
sb.set_style('whitegrid')
# single-colour palette so that color_palette()[0] below is the slateblue tone
flatui = ['slateblue']
sb.set_palette(flatui, n_colors=1, desat=0.8)
# Seaborn's point plot (error bars are left at seaborn's default here)
sb.pointplot(data = month_df, x = "month", y = "rentals", linestyles = "-",
color = sb.color_palette()[0], alpha=0.5)
# improve plot aesthetics
plt.title('Avg. bike rentals based on month of the year\n', weight = 'bold', fontsize = 16)
plt.ylabel('Avg. bike rentals\n', fontsize = 14)
plt.xlabel('\nMonth of the year', fontsize = 14)
# calling xticks/yticks with only fontsize restyles the existing ticks
plt.xticks(fontsize = 12)
plt.yticks(fontsize = 12)
# remove all four axis spines
sb.despine(top=True, right=True, left=True, bottom=True);
# bbox_inches='tight' makes the saved image include all of the x and y labels
plt.savefig('plots/3.1.11.d Average bike rentals based on month of the year.png', dpi=300, bbox_inches='tight')
The above plot contains error bars (by default seaborn draws a bootstrapped confidence interval around each mean, not a standard deviation), which interfere with the interpretability of the plot. Hence remove the error bars for cleaner plot aesthetics. Also, the y-axis of the above plot does not start at zero, which amplifies the differences between the average rentals for a given month across years. Re-plot the above graph with the y-axis starting at zero.
Point plot:
# Point plot of average rentals per month with a zero-based y axis and a value
# label above every point; months near the maximum are highlighted in gold.
# Assign grid and figure size as per requirement
plt.figure(figsize=[8,4])
sb.set_style('white')
flatui = ['slateblue']
sb.set_palette(flatui, n_colors=1, desat=0.8)
# Seaborn's point plot (ci=None switches the error bars off)
sb.pointplot(data = month_df, x = "month", y = "rentals", linestyles = "-",
             color = sb.color_palette()[0], ci=None)
# improve plot aesthetics
plt.title('Avg. bike rentals based on month of the year\n', weight = 'bold', fontsize = 16)
plt.ylabel('Avg. bike rentals (thousands)\n', fontsize = 14)
plt.xlabel('\nMonth of the year', fontsize = 14)
plt.xticks(fontsize = 12)
# read the automatic y ticks, then rebuild them from zero in steps of 5,000
locs, labels = plt.yticks()
max_count = locs.max()
y_tick_values = np.arange(0, max_count+5000, 5000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize = 12)
# add annotations
# -------------------------------------------------------
cat_order = month_df.month.sort_values(ascending=True).unique()
# select 'rentals' before averaging so no other column has to be averaged
avg_rental_counts = month_df.groupby([month_df["month"]])['rentals'].mean()[cat_order]
avg_rental_max = avg_rental_counts.max()
# gold for months above 4/5 of the best month's average, green otherwise
clrs = ['gold' if (count > ((avg_rental_max*4)/5)) else 'limegreen' for count in avg_rental_counts ]
# vertical gap between a point and its label
label_offset = int(avg_rental_max/10)
# get the current tick locations and labels
locs, labels = plt.xticks()
# tick locations, averages and colours are all in ascending month order
for loc, count, clr in zip(locs, avg_rental_counts, clrs):
    pct_string = '{:0.0f} K'.format(count/1000)
    plt.text(loc, count + label_offset, pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------
sb.despine(top=True, right=True, left=False, bottom=False);
# bbox_inches='tight' makes the saved image include all of the x and y labels
plt.savefig('plots/3.1.11.e Average bike rentals based on month of the year.png', dpi=300, bbox_inches='tight')
The above plot depicts the distribution of average bike rentals over the month of the year. The yellow annotations represent the busy months in the year.
Observations:
Column: quarter. Data type: continuous data. Plot: Count plot:
# Bar chart of trips per quarter (all years pooled); every bar is annotated
# with its percentage of all trips.
# Assign grid and figure size as per requirement
plt.figure(figsize = [6, 6])
sb.set(style="white")
# prepare the data for the plot
# count the trips per quarter once and reuse the result for ticks and annotations
quarter_counts = bikeshare.quarter.value_counts()
quarter_max = quarter_counts.max()
n_points = bikeshare.shape[0]
# y ticks every 100,000 trips, labelled in millions
y_tick_values = np.arange(0, quarter_max+100000, 100000)
y_tick_names = ['{:0.1f} M'.format(v/1000000) for v in y_tick_values]
# Seaborn's count plot
sb.countplot(data = bikeshare, x = 'quarter', color = '#47b1c9', alpha = 0.8, saturation = 0.8)
# improve plot aesthetics
plt.title('Aggregated distribution of quarterly bike rentals\n', fontsize = 16, weight = 'bold')
plt.xlabel('\nQuarter of the year', fontsize = 14)
plt.ylabel('Bike Rentals (million)\n', fontsize = 14)
plt.xticks(fontsize = 12)
plt.yticks(y_tick_values, y_tick_names, fontsize = 12)
# add annotations
# -------------------------------------------------------
# get the current tick locations and labels
locs, labels = plt.xticks()
# loop through each pair of locations and labels
for loc, label in zip(locs, labels):
    # NOTE(review): the tick text is used as the lookup key as-is, whereas the
    # month and year charts convert it with int(); presumably `quarter` holds
    # text labels - verify against the column's dtype.
    try:
        count = quarter_counts[label.get_text()]
    except KeyError:
        # no count found for this tick label: show 0% at a fixed height
        count = 10000
        pct_string = '0%'
    else:
        pct_string = '{:0.0f}%'.format(100*count/n_points)
    # short bars get the text above the bar top, all others just below it
    if count < (quarter_max/10):
        text_y = count+(quarter_max/25)
    else:
        text_y = count-(quarter_max/15)
    plt.text(loc, text_y, pct_string, ha = 'center', color = 'white', weight = 'bold', fontsize = 14)
# -------------------------------------------------------
sb.despine();
# bbox_inches='tight' makes the saved image include all of the x and y labels
plt.savefig('plots/3.1.12.a Aggregated distribution of bike rentals based on the Quarter of the year.png', dpi=300, bbox_inches='tight')
The above plot shows that the third quarter of the year has the highest aggregated bike rentals, while the first quarter has the lowest.
Find average rentals based on quarter of the year:
Create a dataset that contains the bike rentals for each quarter of the year. Care should be taken to include every quarter in the respective year. Use all categorical combinations and fill the NULL values with numerical zeros, so that bike rentals are considered for each quarter in every year.
# create a dataset for bike rentals over the quarter of the year:
# one row per (year, quarter) group with the number of trips in it.
# trip_id is selected before counting so only that column is counted,
# instead of counting every column and discarding all but one.
quarter_df = bikeshare.groupby([bikeshare['year'],
                                bikeshare['quarter']])['trip_id'].count().reset_index(name='rentals')
# kept from the original as a safeguard; guarantees an integer column
quarter_df['rentals'] = quarter_df['rentals'].fillna(0).astype(int)
quarter_df.head(10)
Point plot:
# Point plot of `rentals` against `quarter` from `quarter_df` (built earlier in
# this notebook); seaborn aggregates the rows that share a quarter.
# Assign grid and figure size as per requirement
plt.figure(figsize=[8,4])
sb.set_style('whitegrid')
# single-colour palette so that color_palette()[0] below is the '#47b1c9' tone
flatui = ['#47b1c9']
sb.set_palette(flatui, n_colors=1, desat=0.8)
# Seaborn's point plot (error bars are left at seaborn's default here)
sb.pointplot(data = quarter_df, x = "quarter", y = "rentals", linestyles = "-",
color = sb.color_palette()[0], alpha=0.5)
# improve plot aesthetics
plt.title('Avg. bike rentals based on quarter of the year\n', weight = 'bold', fontsize = 16)
plt.ylabel('Avg. rentals\n', fontsize = 14)
plt.xlabel('\nQuarter of the year', fontsize = 14)
# calling xticks/yticks with only fontsize restyles the existing ticks
plt.xticks(fontsize = 12)
plt.yticks(fontsize = 12)
# remove all four axis spines
sb.despine(top=True, right=True, left=True, bottom=True);
# bbox_inches='tight' makes the saved image include all of the x and y labels
plt.savefig('plots/3.1.12.b Average bike rentals based on quarter of the year.png', dpi=300, bbox_inches='tight')
The above plot contains error bars (by default seaborn draws a bootstrapped confidence interval around each mean, not a standard deviation), which interfere with the interpretability of the plot. Hence remove the error bars for cleaner plot aesthetics. Also, the y-axis of the above plot does not start at zero, which amplifies the differences between the average rentals for any given quarter of the year. Re-plot the above graph with the y-axis starting at zero.
Point plot:
# Point plot of average rentals per quarter with a zero-based y axis and a value
# label above every point; quarters near the maximum are highlighted in gold.
# Assign grid and figure size as per requirement
plt.figure(figsize=[6,4])
sb.set_style('white')
flatui = ['#47b1c9']
sb.set_palette(flatui, n_colors=1, desat=1)
# Seaborn's point plot (ci=None switches the error bars off)
sb.pointplot(data = quarter_df, x = "quarter", y = "rentals", linestyles = "-",
             color = sb.color_palette()[0], ci=None)
# improve plot aesthetics
plt.title('Avg. bike rentals based on quarter of the year\n', weight = 'bold', fontsize = 16)
plt.ylabel('Avg. rentals (thousands)\n', fontsize = 14)
plt.xlabel('\nQuarter of the year', fontsize = 14)
plt.xticks(fontsize = 12)
# read the automatic y ticks, then rebuild them from zero in steps of 25,000
locs, labels = plt.yticks()
max_count = locs.max()
y_tick_values = np.arange(0, max_count+25000, 25000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize = 12)
# add annotations
# -------------------------------------------------------
cat_order = quarter_df.quarter.sort_values(ascending=True).unique()
# select 'rentals' before averaging so no other column has to be averaged
avg_rental_counts = quarter_df.groupby([quarter_df["quarter"]])['rentals'].mean()[cat_order]
avg_rental_max = avg_rental_counts.max()
# gold for quarters above 4/5 of the best quarter's average, green otherwise
clrs = ['gold' if (count > ((avg_rental_max*4)/5)) else 'limegreen' for count in avg_rental_counts ]
# vertical gap between a point and its label
label_offset = int(avg_rental_max/10)
# get the current tick locations and labels
locs, labels = plt.xticks()
# tick locations, averages and colours are all in ascending quarter order
for loc, count, clr in zip(locs, avg_rental_counts, clrs):
    pct_string = '{:0.0f} K'.format(count/1000)
    plt.text(loc, count + label_offset, pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------
sb.despine(top=True, right=True, left=False, bottom=False);
# bbox_inches='tight' makes the saved image include all of the x and y labels
plt.savefig('plots/3.1.12.c Average bike rentals based on quarter of the year.png', dpi=300, bbox_inches='tight')
The above plot depicts the distribution of average bike rentals over the quarters of the year. The yellow annotations represent the busiest quarters in the year. It appears that the third and fourth quarters have the highest average bike rentals over the year.
Observations: The above plot depicts that the first quarter has the least rental activity while third and fourth quarter see highest rental activity.
Column: year. Data type: continuous data. Plot: Count plot:
# Bar chart of trips per year; every bar is annotated with its percentage of
# all trips.
# Assign grid and figure size as per requirement
sb.set_style('white')
plt.figure(figsize = [6, 6])
sb.set_palette('deep', n_colors=10, desat=0.5)
base_color = sb.color_palette()[3]
# prepare data for the plot
# count the trips per year once and reuse the result for ticks and annotations
yearly_counts = bikeshare.year.value_counts()
year_max = yearly_counts.max()
max_count = year_max
n_points = bikeshare.shape[0]
yearly_order = bikeshare.year.sort_values(ascending=True).unique()
# y ticks every 100,000 trips, labelled in millions
y_tick_values = np.arange(0, max_count + 100000, 100000)
y_tick_names = ['{:0.1f} M'.format(v/1000000) for v in y_tick_values]
# Seaborn's count plot
sb.countplot(data = bikeshare, x = 'year', color = base_color,
             alpha= 0.8, order = yearly_order, saturation = 1)
# improve plot aesthetics
plt.title('Distribution of yearly bike rentals\n', fontsize = 16, weight = 'bold')
plt.xlabel('\nYear', fontsize = 14)
plt.ylabel('Bike Rentals (millions)\n', fontsize = 14)
plt.xticks(fontsize = 12)
plt.yticks(y_tick_values, y_tick_names, fontsize = 12)
# add annotations
# -------------------------------------------------------
# get the current tick locations and labels
locs, labels = plt.xticks()
# loop through each pair of locations and labels
for loc, label in zip(locs, labels):
    # the tick text is the year; use it to look up that year's count
    try:
        count = yearly_counts[int(label.get_text())]
    except KeyError:
        # no count found for this tick label: show 0% at a fixed height
        count = 10000
        pct_string = '0%'
    else:
        pct_string = '{:0.0f}%'.format(100*count/n_points)
    # short bars get the text above the bar top, all others just below it
    if count < (year_max/10):
        text_y = count+(year_max/25)
    else:
        text_y = count-(year_max/10)
    plt.text(loc, text_y, pct_string, ha = 'center', color = 'white', weight = 'bold', fontsize = 14)
# -------------------------------------------------------
sb.despine();
# bbox_inches='tight' makes the saved image include all of the x and y labels
plt.savefig('plots/3.1.13.a Distribution of yearly rentals.png', dpi=300, bbox_inches='tight')
Find average rentals based on the year:
Create a dataset that contains the bike rentals for each year. Care should be taken to include every year. Use all categorical combinations and fill the NULL values with numerical zeros, so that bike rentals are considered for each year.
# create a dataset for bike rentals over the years: one row per year group
# with the number of trips in it. trip_id is selected before counting so only
# that column is counted, instead of counting every column and discarding the rest.
year_df = bikeshare.groupby([bikeshare['year']])['trip_id'].count().reset_index(name='rentals')
# kept from the original as a safeguard; guarantees an integer column
year_df['rentals'] = year_df['rentals'].fillna(0).astype(int)
year_df
Point plot:
# Point plot of `rentals` against `year` from `year_df` (built earlier in this
# notebook), with the automatic y ticks relabelled in thousands.
# Assign grid and figure size as per requirement
plt.figure(figsize=[8,4])
sb.set_style('whitegrid')
sb.set_palette('deep', n_colors=10, desat=0.5)
# fourth colour of the desaturated 'deep' palette
base_color = sb.color_palette()[3]
# Seaborn's point plot
sb.pointplot(data = year_df, x = "year", y = "rentals", linestyles = "-",
color = base_color, alpha=0.5)
# improve plot aesthetics
plt.title('Avg. bike rentals based on the year\n', weight = 'bold', fontsize = 16)
plt.ylabel('Avg. bike rentals\n', fontsize = 14)
plt.xlabel('\nYear', fontsize = 14)
plt.xticks(fontsize = 12)
# keep matplotlib's automatic tick positions but label them in thousands ("K")
y_locs, y_labels = plt.yticks()
y_tick_names = ['{:0.0f} K'.format(y_loc/1000) for y_loc in y_locs]
plt.yticks(y_locs, y_tick_names, fontsize = 12)
# keep only the bottom spine
sb.despine(top=True, right=True, left=True, bottom=False);
# bbox_inches='tight' makes the saved image include all of the x and y labels
plt.savefig('plots/3.1.13.b Average bike rentals based on the year.png', dpi=300, bbox_inches='tight')
The y-axis of the above plot does not start at zero, which amplifies the differences between the average rentals for any given year. Re-plot the above graph with the y-axis starting at zero.
Point plot:
# Point plot of average rentals per year with a zero-based y axis and a value
# label above every point; years near the maximum are highlighted in gold.
# Assign grid and figure size as per requirement
plt.figure(figsize=[6,4])
sb.set_style('white')
sb.set_palette('deep', n_colors=10, desat=0.5)
base_color = sb.color_palette()[3]
# Seaborn's point plot (ci=None switches the error bars off)
sb.pointplot(data = year_df, x = "year", y = "rentals", linestyles = "-",
             color = base_color, ci=None)
# improve plot aesthetics
plt.title('Avg. bike rentals based on the year\n', weight = 'bold', fontsize = 16)
plt.ylabel('Avg. bike rentals (thousands)\n', fontsize = 14)
plt.xlabel('\nYear', fontsize = 14)
plt.xticks(fontsize = 12)
# read the automatic y ticks, then rebuild them from zero in steps of 50,000
locs, labels = plt.yticks()
max_count = locs.max()
y_tick_values = np.arange(0, max_count+50000, 50000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize = 12)
# add annotations
# -------------------------------------------------------
cat_order = year_df.year.sort_values(ascending=True).unique()
# select 'rentals' before averaging so no other column has to be averaged
avg_rental_counts = year_df.groupby([year_df["year"]])['rentals'].mean()[cat_order]
avg_rental_max = avg_rental_counts.max()
# gold for years above 4/5 of the best year's average, green otherwise
clrs = ['gold' if (count > ((avg_rental_max*4)/5)) else 'limegreen' for count in avg_rental_counts ]
# vertical gap between a point and its label
label_offset = int(avg_rental_max/10)
# get the current tick locations and labels
locs, labels = plt.xticks()
# tick locations, averages and colours are all in ascending year order
for loc, count, clr in zip(locs, avg_rental_counts, clrs):
    pct_string = '{:0.0f} K'.format(count/1000)
    plt.text(loc, count + label_offset, pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------
sb.despine(top=True, right=True, left=False, bottom=False);
# bbox_inches='tight' makes the saved image include all of the x and y labels
plt.savefig('plots/3.1.13.c Average bike rentals based on the year.png', dpi=300, bbox_inches='tight')
Observations:
The yellow annotations represent the busiest years. It appears that the founding year 2017 has relatively low average bike rentals, while 2018 has the highest average bike rentals. However, there is a slight decrease in average rentals in the following year, 2019, which is not a good sign for the business and indicates the need for better business plans. Column: year and month. Data type: continuous data. Plot: Timeseries plot:
Plot the continuous timeline distribution of bike rentals over the years 2017, 2018, and 2019.
# display the unique (year, month) combinations in chronological order.
# The group index is already sorted and unique, and size() avoids counting
# every column just to obtain it.
bikeshare.groupby([bikeshare["year"],
                   bikeshare["month"]]).size().index
# change the format of the dates for the plot: each (year, month) pair
# becomes a 'year-month' text label, in chronological order
dates = bikeshare.groupby([bikeshare["year"],
                           bikeshare["month"]]).size().index
new_dates = ['{}-{}'.format(year, month) for year, month in dates]
new_dates
# Filled line chart of trips per (year, month) along a continuous timeline.
# Assign grid and figure size as per requirement
sb.set_style('darkgrid')
plt.figure(figsize = [20, 6])
sb.set_palette('deep', n_colors=10, desat=0.8)
base_color = sb.color_palette()[3]
# prepare data for the plot
# a single groupby provides both the labels and the counts, so x and y
# always come from the same rows in the same order
monthly_rentals = bikeshare.groupby([bikeshare["year"],
                                     bikeshare["month"]])['trip_id'].count()
dates = monthly_rentals.index
new_dates = ['{}-{}'.format(year, month) for year, month in dates]
x = new_dates
y = monthly_rentals.values
# y ticks every 10,000 trips, labelled in thousands
y_tick_values = np.arange(0, y.max()+10000, 10000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
# Matplotlib's line plot
plt.plot(x, y, linewidth=2.0, color = base_color, alpha = 0.6)
# improve plot aesthetics
plt.title('Timeline distribution of bike rentals\n', fontsize = 18, weight = 'bold')
plt.xlabel('\n(Year - Month)', fontsize = 16)
plt.ylabel('Bike Rentals (thousands)\n', fontsize = 16)
plt.xticks(rotation = 50, fontsize = 14)
plt.yticks(y_tick_values, y_tick_names, fontsize = 14)
# shade the area under the line
plt.fill_between(x, y, color = base_color, alpha = 0.6);
# bbox_inches='tight' makes the saved image include all of the x and y labels
plt.savefig('plots/3.1.14 Timeseries distribution of bike rentals based on year and month.png', dpi=300, bbox_inches='tight')
Observations: The bike rental activity always decreases at the start of every year and slowly increases towards the end of the respective year.
end_time — Column: end_time. Data type: Categorical, nominal. Plot: Histogram. Line plot:
# Overlaid filled line charts of rentals (by start hour) and returns (by end
# hour), with a vertical marker at the peak hour of each.
plt.figure(figsize = [8, 6])
sb.set_style('darkgrid')
# count the trips per hour once for each timestamp column
start_counts = bikeshare.groupby(bikeshare['start_time'].dt.hour)['trip_id'].count()
end_counts = bikeshare.groupby(bikeshare['end_time'].dt.hour)['trip_id'].count()
x1, y1 = start_counts.index, start_counts.values
x2, y2 = end_counts.index, end_counts.values
x_tick_values = np.arange(0, 23+1, 1)
x_tick_names = ['{:}'.format(v) for v in x_tick_values]
# size the y ticks from the taller of the two curves so neither runs past the last tick
y_tick_values = np.arange(0, max(y1.max(), y2.max())+10000, 10000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
plt.plot(x1, y1, linewidth=2.0, color = 'lightskyblue', alpha = 0.5)
plt.plot(x2, y2, linewidth=2.0, color = 'orange', alpha = 0.5)
plt.title('Distribution of hourly entries\n', fontsize = 16, weight = 'bold')
plt.xlabel('\nHour of the day', fontsize = 14)
plt.ylabel('Rentals (thousands)\n', fontsize = 14)
plt.xticks(x_tick_values, x_tick_names, fontsize = 12)
plt.yticks(y_tick_values, y_tick_names, fontsize = 12)
plt.fill_between(x1, y1, color = 'lightskyblue', alpha = 0.5)
plt.fill_between(x2, y2, color = 'orange', alpha = 0.5)
# draw the vertical axial line at the peak hour (the hour with the highest count)
start_peak_hour = start_counts.idxmax()
plt.axvline(start_peak_hour, color='black', alpha=0.3, linewidth=2)
end_peak_hour = end_counts.idxmax()
plt.axvline(end_peak_hour, color='pink', alpha=0.3, linewidth=2);
# add custom legend: proxy lines carry the colours of the two curves
custom_lines = [Line2D([0], [0], color= 'lightskyblue', lw=2),
                Line2D([0], [0], color= 'orange', lw=2)]
plt.legend(custom_lines, ['Rentals', 'Returns'], scatterpoints=1, frameon=True, fancybox=True, shadow=False,
           ncol = 1, framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5,
           title='Based on time', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.3, 1));
# bbox_inches='tight' makes the saved image include all of the x and y labels
plt.savefig('plots/3.1.15 Comparision of hourly start time bike rentals and end time returns.png', dpi=300, bbox_inches='tight')
The duration of the bike rentals ranges between minutes to hours. Hence the bike return charts are no different from rentals when estimated on timelines other than hours. Hence there is no need to analyze the distribution of bike returns over timeline.
start_station_id — Column: start_station_id. Data type: Categorical, nominal. Plot: Histogram. Should the rentals be calculated based on start_station_id or on the combination of start_lat and start_lon?
Exploration of a possible approach to obtaining the rental traffic:
# Number of trips for every (start station, latitude, longitude) combination;
# only the first 20 combinations are shown.
start_location_keys = ['start_station_id', 'start_lat', 'start_lon']
bikeshare.groupby(start_location_keys).size().head(20)
From the above data it is evident that a start_station can have more than one combination of start_lat and start_lon. This is because of the geographical extension of the start_station over the zone. Hence the rentals are to be calculated over the start_station_id and not over the combination of start_lat and start_lon.
Distribution of rental traffic based on start_stations.
# Rentals per start station: group sizes keyed by station id, turned into a
# two-column frame (start_station_id, rentals).
rentals_per_station = bikeshare.groupby('start_station_id').size()
start_stations = rentals_per_station.reset_index(name='rentals')
start_stations.head()
# summary statistics (count, mean, spread, quartiles) of rentals per station
start_stations['rentals'].describe()
Plot the distribution of start stations rental traffic:
# Histogram of the number of rentals per start station (natural units).
sb.set_style('white')
# Seaborn's distribution plot (kde disabled, so only the histogram bars are drawn)
sb.distplot(start_stations['rentals'], kde = False, hist_kws = {'alpha' : 1}, color = 'darkturquoise')
# improve plot aesthetics
plt.title('start stations - bike rental traffic', fontsize = 14, weight = 'bold')
plt.xlabel('Bike rentals', fontsize = 12)
plt.ylabel('Station count', fontsize = 12);
# bbox_inches='tight' makes the saved image include all of the x and y labels
plt.savefig('plots/3.1.16.a Distribution of start stations bike rental traffic.png', dpi=300, bbox_inches='tight')
This data, in its natural units, looks
highly right-skewed: lots of points with low values, with a very long tail of data points with large values (also, all values are positive). The most common example of this is data that follows an approximately log-normal distribution. Such data can look highly skewed in its natural units; however, after applying a logarithmic transform, the data will follow a normal distribution. Hence let us apply a logarithmic transformation to the start stations' bike rental data.
Plot the Logarithmic distribution of start stations bike rentals:
def log_trans(x, inverse = False):
    """Convert between natural units and base-10 logarithms.

    Parameters
    ----------
    x : number or array-like
        Value(s) to transform.
    inverse : bool, default False
        When False return ``log10(x)``; when True return ``10 ** x``.
    """
    if inverse:
        return 10 ** x
    return np.log10(x)
# Histogram of rentals per start station on a log10 scale, with the x ticks
# labelled back in natural units.
sb.set_style('white')
# prepare the data for the plot
max_value = log_trans(start_stations['rentals'].max())
# bins 0.1 wide in log10 units, from 0 up to just past the largest value
bin_edges = np.arange(0, max_value+0.1, 0.1)
# matplotlib's histogram; log_trans is applied to the whole column at once
plt.hist(log_trans(start_stations['rentals']), bins = bin_edges, color = 'darkturquoise')
# improve plot aesthetics
# ticks at whole powers of ten, labelled with the natural-unit value (1, 10, 100, ...)
tick_locs = np.arange(0, max_value+1, 1)
plt.xticks(tick_locs, log_trans(tick_locs, inverse = True).astype(int))
plt.title('Logarithmic distribution of start stations rentals', fontsize = 14, weight = 'bold')
plt.xlabel('Number of bike rentals', fontsize = 12)
plt.ylabel('Number of Stations', fontsize = 12);
# bbox_inches='tight' makes the saved image include all of the x and y labels
plt.savefig('plots/3.1.16.b Distribution of start stations bike rental traffic.png', dpi=300, bbox_inches='tight')
Classification of start_stations based on their rental traffic:
# Print how many start stations fall into each rental-traffic band
# (band limits are powers of ten: 10, 100, 1000, 10000).
start_bands = [
    ('Very Low traffic start stations', ' rentals < 10 '),
    ('Low traffic start stations', ' rentals >= 10 and rentals < 100 '),
    ('Normal traffic start stations', ' rentals >= 100 and rentals < 1000 '),
    ('High traffic start stations', ' rentals >= 1000 and rentals < 10000 '),
    ('Very High traffic start stations', ' rentals >= 10000 '),
]
print('Total number of start stations'.ljust(35, ' '), ':', start_stations.shape[0], '\n')
for band_title, band_filter in start_bands:
    # titles are padded to 35 characters so the counts line up in one column
    print(band_title.ljust(35, ' '), ':', start_stations.query(band_filter).shape[0])
Create a dataframe based on bike rentals traffic and number of start stations associated with them.
# Table with one row per traffic band: the band name and the number of start
# stations whose rental count falls inside it.
band_names = ['Very Low', 'Low', 'Normal', 'High', 'Very High']
band_filters = [' rentals < 10 ',
                ' rentals >= 10 and rentals < 100 ',
                ' rentals >= 100 and rentals < 1000 ',
                ' rentals >= 1000 and rentals < 10000 ',
                ' rentals >= 10000 ']
band_station_counts = [start_stations.query(band_filter).shape[0] for band_filter in band_filters]
rentals = {'rental_traffic' : pd.Series(band_names),
           'start_stations' : pd.Series(band_station_counts)}
# create Dataframe.
bike_rentals = pd.DataFrame(rentals)
bike_rentals
Pie chart:
def absolute_value(val):
    """Turn a wedge percentage into the absolute count it stands for.

    ``val`` is a percentage (0-100) of the module-level ``type_level_counts``
    total; the result is that share of the total, rounded to the nearest
    whole number and returned as an int.
    """
    total = type_level_counts.sum()
    return int(np.round(val / 100. * total, 0))
# Ring-shaped pie chart of start stations per rental-traffic band.
# prepare the data for the plot
# wedge sizes and wedge labels come from the two columns of `bike_rentals`
type_level_counts = bike_rentals.start_stations.values
type_level_index = bike_rentals.rental_traffic.values
# pull the first wedge out of the ring and give it the lighter colour
explode = (0.2, 0, 0, 0, 0)
colors = ['paleturquoise', 'darkturquoise', 'darkturquoise', 'darkturquoise', 'darkturquoise']
# matplotlib's pie chart: wedges run clockwise from the top; a wedge width of
# 0.4 leaves the centre empty; autopct=absolute_value labels each wedge with
# the value returned by that callback
plt.pie(type_level_counts, labels = type_level_index, startangle = 90,
counterclock = False, wedgeprops = {'width' : 0.4}, shadow=False,
explode=explode, colors=colors, textprops={'fontsize': 14},
autopct=absolute_value, labeldistance=1.1, pctdistance=0.8)
plt.title('Start stations based on rental traffic\n\n', fontsize = 14, weight = 'bold')
plt.axis('square');
# bbox_inches='tight' makes the saved image include all of the x and y labels
plt.savefig('plots/3.1.16.c Classification of start stations based on bike rental traffic.png', dpi=300, bbox_inches='tight')
Bar chart:
# Bar chart of start stations per rental-traffic band, with the station count
# written on every bar; the smallest band is drawn in a lighter colour.
# Assign grid as per requirement
sb.set_style("white")
# prepare the data for the plot
counts = bike_rentals.start_stations.values
order = bike_rentals.start_stations.index
# tick positions and labels are taken from the table so they cannot drift from it
x_locs = list(range(len(counts)))
x_labels = list(bike_rentals.rental_traffic)
# compute the extremes once instead of once per bar
smallest = counts.min()
tallest = counts.max()
clrs = [ 'darkturquoise' if (x > smallest) else 'paleturquoise' for x in counts ]
# seaborn's bar plot
sb.barplot(x = order, y = counts, palette=clrs, alpha= 1, saturation = 0.8)
# improve plot aesthetics
plt.title('Classification of start stations based on Rental Traffic\n', weight = 'bold', fontsize = 16)
plt.xticks(x_locs, x_labels, rotation = 0, fontsize = 12)
# hide the y ticks; the annotations carry the values
plt.yticks([], [], rotation = 0, fontsize = 12)
plt.xlabel('\nBike rental traffic', fontsize = 14)
# add annotations
# -------------------------------------------------------
short_bar_limit = int(tallest/10)
# loop through each pair of locations and counts
for loc, count in zip(x_locs, counts):
    pct_string = '{:0.0f}'.format(count)
    # short bars: bold black text above the bar; other bars: white text inside it
    if count < short_bar_limit:
        plt.text(loc, count+int(tallest/20), pct_string, ha = 'center', color = 'black', weight = 'bold', fontsize = 13)
    else:
        plt.text(loc, count-int(tallest/10), pct_string, ha = 'center', color = 'white', fontsize = 13)
# -------------------------------------------------------
sb.despine(fig=None, ax=None, top=True, right=True, left=True, bottom=False, offset=None, trim=False);
# bbox_inches='tight' makes the saved image include all of the x and y labels
plt.savefig('plots/3.1.16.d Classification of start stations based on bike rental traffic.png', dpi=300, bbox_inches='tight')
end_station_id — Column: end_station_id. Data type: Categorical, nominal. Plot: Histogram. Should the bike return traffic be calculated based on end_station_id or on the combination of end_lat and end_lon?
Exploration of a possible approach to obtaining the bike return traffic:
# Number of trips for every (end station, latitude, longitude) combination;
# only the first 20 combinations are shown.
end_location_keys = ['end_station_id', 'end_lat', 'end_lon']
bikeshare.groupby(end_location_keys).size().head(20)
From the above data it is evident that an end_station can have more than one combination of end_lat and end_lon. This is because of the geographical extension of the end_station over the zone. Hence the bike return traffic is to be calculated over the end_station_id and not over the combination of end_lat and end_lon.
Distribution of end_stations based on their bike return traffic.
# Returns per end station: group sizes keyed by station id, turned into a
# two-column frame (end_station_id, returns).
returns_per_station = bikeshare.groupby('end_station_id').size()
end_stations = returns_per_station.reset_index(name='returns')
end_stations.head()
# summary statistics (count, mean, spread, quartiles) of returns per station
end_stations['returns'].describe()
Plot the distribution of end stations return traffic:
# Histogram of the number of returns per end station (natural units).
sb.set_style('white')
# seaborn's distribution plot (kde disabled, so only the histogram bars are drawn)
sb.distplot(end_stations['returns'], kde = False, hist_kws = {'alpha' : 1}, color = 'salmon')
# improve plot aesthetics
plt.title('End stations - bike return traffic', fontsize = 14, weight = 'bold')
plt.xlabel('Bike returns', fontsize = 12)
plt.ylabel('Number of Stations', fontsize = 12);
# bbox_inches='tight' makes the saved image include all of the x and y labels
plt.savefig('plots/3.1.17.a Distribution of end stations bike return traffic.png', dpi=300, bbox_inches='tight')
This data, in its natural units, looks
highly right-skewed: lots of points with low values, with a very long tail of data points with large values (also, all values are positive). The most common example of this is data that follows an approximately log-normal distribution. Such data can look highly skewed in its natural units; however, after applying a logarithmic transform, the data will follow a normal distribution. Hence let us apply a logarithmic transformation to the end stations' bike return data.
Plot the Logarithmic distribution of end stations bike returns:
def log_trans(x, inverse=False):
    """Map values onto the base-10 logarithmic scale, or back off it.

    x       : number or numpy-compatible array of numbers.
    inverse : when falsy (default) return log10(x); when truthy return 10**x.
    """
    if inverse:
        return 10 ** x
    return np.log10(x)
sb.set_style('white')
# Bin the per-station return counts on a log10 axis.
station_returns = end_stations['returns']
min_value = log_trans(station_returns.min())
max_value = log_trans(station_returns.max())
# bins are 0.1 wide in log10 units and start at log10 = 0 (one return)
bin_edges = np.arange(0, max_value+0.1, 0.1)
plt.hist(station_returns.apply(log_trans), bins=bin_edges, color='salmon')
# One x tick per whole power of ten, labelled with the untransformed count.
tick_locs = np.arange(0, max_value+1, 1)
tick_labels = log_trans(tick_locs, inverse=True).astype(int)
plt.xticks(tick_locs, tick_labels)
plt.title('Logarithmic distribution of end stations returns', fontsize=14, weight='bold')
plt.xlabel('Number of bike returns', fontsize=12)
plt.ylabel('Number of Stations', fontsize=12)
# Replace the automatic y ticks with whole numbers in steps of five.
y_locs, y_ticks = plt.yticks()
max_count = math.ceil(max(y_locs))
y_locs = np.arange(0, max_count+5, 5)
y_labels = y_locs
plt.yticks(y_locs, y_labels)
# bbox_inches='tight' sizes the saved image so that the axis labels are not cut off.
plt.savefig('plots/3.1.17.b Distribution of end stations bike return traffic.png', dpi=300, bbox_inches='tight')
Classification of end_stations based on their bike returns traffic.
# Report how many end stations fall into each return-traffic band.
print('Total number of end stations'.ljust(30, ' '), ':', end_stations.shape[0], '\n')
traffic_bands = [
    ('Very Low traffic end stations', ' returns < 10 '),
    ('Low traffic end stations', ' returns >= 10 and returns < 100 '),
    ('Normal traffic end stations', ' returns >= 100 and returns < 1000 '),
    ('High traffic end stations', ' returns >= 1000 and returns < 10000 '),
    ('Very High traffic end stations', ' returns >= 10000 '),
]
for band_label, band_filter in traffic_bands:
    print(band_label.ljust(30, ' '), ':', end_stations.query(band_filter).shape[0])
Create a dataframe of the bike return traffic levels and the number of end stations associated with each of them.
# Traffic band names, each paired with the filter that selects its stations.
band_names = ['Very Low', 'Low', 'Normal', 'High', 'Very High']
band_filters = [' returns < 10 ',
                ' returns >= 10 and returns < 100 ',
                ' returns >= 100 and returns < 1000 ',
                ' returns >= 1000 and returns < 10000 ',
                ' returns >= 10000 ']
stations_per_band = [end_stations.query(band_filter).shape[0] for band_filter in band_filters]
returns = {'return_traffic' : pd.Series(band_names),
           'end_stations' : pd.Series(stations_per_band)}
# One row per traffic band.
bike_returns = pd.DataFrame(returns)
bike_returns
Pie chart:
def absolute_value(val):
    """Turn a wedge's percentage (as passed by autopct) into a whole-number count.

    Reads the module-level ``type_level_counts`` for the total.
    """
    total = type_level_counts.sum()
    return int(np.round(val/100.*total, 0))


# Wedge sizes and wedge labels.
type_level_counts = bike_returns.end_stations.values
type_level_index = bike_returns.return_traffic.values
# Only the first wedge is pulled out of the ring and coloured differently.
explode = (0.2, 0, 0, 0, 0)
colors = ['bisque'] + ['salmon'] * 4
# Donut-style pie: wedges of width 0.4, clockwise from 90 degrees,
# annotated with absolute counts instead of percentages.
pie_options = {
    'labels': type_level_index,
    'startangle': 90,
    'counterclock': False,
    'wedgeprops': {'width' : 0.4},
    'shadow': False,
    'explode': explode,
    'colors': colors,
    'textprops': {'fontsize': 14},
    'autopct': absolute_value,
    'labeldistance': 1.1,
    'pctdistance': 0.8,
}
plt.pie(type_level_counts, **pie_options)
plt.title('End stations based on return traffic\n\n', fontsize=14, weight='bold')
plt.axis('square')
# bbox_inches='tight' sizes the saved image so that the labels are not cut off.
plt.savefig('plots/3.1.17.c Classification of end stations based on bike return traffic.png', dpi=300, bbox_inches='tight')
Bar chart:
# Plain white background; remember the first colour of the active palette.
sb.set_style("white")
base_color = sb.color_palette()[0]
# Bar heights, bar positions and tick labels.
counts = bike_returns.end_stations.values
order = bike_returns.end_stations.index
x_locs = [0, 1, 2, 3, 4]
x_labels = ['Very Low', 'Low', 'Normal', 'High', 'Very High']
# The band(s) with the fewest stations get the lighter shade.
fewest_stations = bike_returns.end_stations.values.min()
clrs = ['lightsalmon' if not (x > fewest_stations) else 'salmon' for x in counts]
sb.barplot(x=order, y=counts, palette=clrs, alpha=1, saturation=0.8)
# Title and axes; y ticks are removed because every bar is annotated instead.
plt.title('Classification of end stations based on Return traffic\n', weight='bold', fontsize=16)
plt.xticks(x_locs, x_labels, rotation=0, fontsize=12)
plt.yticks([], [], rotation=0, fontsize=12)
plt.xlabel('\nBike return traffic', fontsize=14)
# Annotate each bar with its count: above short bars (black text),
# inside tall bars (white text).
tallest = max(counts)
inside_offset = int(tallest/10)
above_offset = int(tallest/20)
for loc, count in zip(x_locs, counts):
    pct_string = '{:0.0f}'.format(count)
    is_short_bar = count <= inside_offset
    text_y = count+above_offset if is_short_bar else count-inside_offset
    text_color = 'black' if is_short_bar else 'white'
    plt.text(loc, text_y, pct_string, ha='center', color=text_color, fontsize=13)
# Keep only the bottom spine.
sb.despine(left=True)
# bbox_inches='tight' sizes the saved image so that the axis labels are not cut off.
plt.savefig('plots/3.1.17.d Classification of end stations based on bike return traffic.png', dpi=300, bbox_inches='tight')
Column: duration_min. Data type: Quantitative data, Continuous. Plot: Histogram. Explore the optimal value of the bin size that best portrays the distribution of trip durations.
def dist_subplot(subplot, column, binsize):
    """Draw one panel of a 1x3 grid: a histogram of a ``bikeshare`` column.

    subplot : 1-based position of the panel inside the 1x3 grid.
    column  : name of the ``bikeshare`` column to plot.
    binsize : width of the histogram bins, in the column's own units.

    Reads the module-level ``bikeshare`` frame and draws on the current
    matplotlib figure. Only the first panel gets a y-axis label and y tick
    labels; the other panels keep the tick marks but hide their text.
    """
    plt.subplot(1, 3, subplot)
    # y ticks every 200,000 rentals, spanning the total number of rows
    tick_values = np.arange(0, bikeshare.shape[0]+100, 200000)
    tick_names = ['{:0.1f} M'.format(v/1000000) for v in tick_values]
    bin_edges = np.arange(0, bikeshare[column].max()+binsize, binsize)
    # Seaborn's distribution plot (histogram only, no KDE)
    sb.distplot(bikeshare[column], bins = bin_edges, kde = False,
                hist_kws = {'alpha' : 1}, color = 'cadetblue')
    # improve plot aesthetics
    plt.title('Duration distribution - bin size {}'.format(binsize), fontsize = 14, weight = 'bold')
    plt.xlabel('\nDuration (minutes)', fontsize = 12)
    plt.xticks(fontsize=10)
    if subplot == 1:
        plt.ylabel('Rentals (millions)\n', fontsize = 12)
        plt.yticks(tick_values, tick_names, fontsize=10)
    else:
        plt.ylabel('')
        plt.yticks(tick_values, [])
# Compare three candidate bin widths for 'duration_min' side by side.
plt.figure(figsize=[16, 4])
for panel, width in enumerate((100, 500, 1000), start=1):
    dist_subplot(subplot=panel, column='duration_min', binsize=width)
plt.subplots_adjust(wspace=0.1, hspace=0.3)
# bbox_inches='tight' sizes the saved image so that the axis labels are not cut off.
plt.savefig('plots/3.1.18.a Distribution of trip durations - different bin sizes.png', dpi=300, bbox_inches='tight')
The above plot shows that the bin size needs to be small, and that a closer look is required to understand the distribution. Plot the distribution of duration_min, limited to 500 minutes, for closer observation.
sb.set_style('white')
# 10-minute-wide bins covering the whole range of durations.
longest_duration = bikeshare['duration_min'].max()
bin_edges = np.arange(0, longest_duration+10, 10)
sb.distplot(bikeshare['duration_min'], bins=bin_edges, kde=False,
            hist_kws={'alpha' : 1}, color='cadetblue')
# Title and axis labels; the view is cropped to the first 500 minutes.
plt.title('Trip duration distribution - bin size 10', fontsize=14, weight='bold')
plt.xlabel('Duration (minutes)', fontsize=12)
plt.ylabel('Rentals (thousands)', fontsize=12)
plt.xlim(0, 500)
# Relabel the y ticks in thousands.
y_locs, y_labels = plt.yticks()
y_tick_names = ['{:0.0f} K'.format(thousands) for thousands in (y_loc/1000 for y_loc in y_locs)]
plt.yticks(y_locs, y_tick_names)
# bbox_inches='tight' sizes the saved image so that the axis labels are not cut off.
plt.savefig('plots/3.1.18.b Distribution of trip durations - limited to 500 minutes.png', dpi=300, bbox_inches='tight')
This data, in its natural units, looks
highly right skewed: lots of points with low values, with a very long tail of data points with large values (also, all values are positive). The most common example of this is data that follows an approximately log-normal distribution. Such data can look highly skewed in its natural units; however, after applying a logarithmic transform, it will follow a normal distribution. Hence let us apply a logarithmic transformation to the duration_min data.
Logarithmic transformation of trip durations:
Before applying Logarithmic transformation of the duration_min data, separate the data from the entries with 0 values.
# Keep only trips whose duration is not 0 (log10 of 0 is undefined).
has_duration = bikeshare['duration_min'] != 0
duration_non_zero = bikeshare.loc[has_duration, ['duration_min']].copy()
duration_non_zero.head()
# logarithmic transformation of 'duration_min' data
def log_trans(x, inverse=False):
    """Base-10 log transform of x; with a truthy ``inverse`` return 10**x instead."""
    return 10 ** x if inverse else np.log10(x)
# Bin the non-zero trip durations on a log10 axis.
nonzero_minutes = duration_non_zero['duration_min']
min_value = log_trans(nonzero_minutes.min())
max_value = log_trans(nonzero_minutes.max())
# bins are 0.1 wide in log10 units and start at log10 = 0 (one minute)
bin_edges = np.arange(0, max_value+0.1, 0.1)
plt.hist(nonzero_minutes.apply(log_trans), bins=bin_edges, color='cadetblue')
# One x tick per whole power of ten, labelled in untransformed minutes.
tick_locs = np.arange(0, max_value+1, 1)
tick_labels = log_trans(tick_locs, inverse=True).astype(int)
plt.xticks(tick_locs, tick_labels)
plt.title('Logarithmic distribution of trip duration', fontsize=14, weight='bold')
plt.xlabel('Duration (minutes)', fontsize=12)
plt.ylabel('Rentals (thousands)', fontsize=12)
# Relabel the y ticks in thousands.
y_locs, y_labels = plt.yticks()
y_tick_names = ['{:0.0f} K'.format(thousands) for thousands in (y_loc/1000 for y_loc in y_locs)]
plt.yticks(y_locs, y_tick_names)
# bbox_inches='tight' sizes the saved image so that the axis labels are not cut off.
plt.savefig('plots/3.1.18.c Logarithmic distribution of trip duration.png', dpi=300, bbox_inches='tight')
Divide the duration_min based on their distribution, for clear graphical plot.
# Summary statistics (count, mean, std, quartiles, extremes) of the trip durations.
bikeshare.loc[:, 'duration_min'].describe()
As the
duration_min feature is extracted from the start_station co-ordinates and end_station co-ordinates, the entries with round trips will have 0 miles extracted as duration_min. Hence round trips are ignored.
# Report how many trips fall into each duration category (zero-length trips excluded).
print('Total trips'.ljust(15, ' '), ':', bikeshare.shape[0], '\n')
duration_bands = [
    ('Small trips', ' duration_min > 0 and duration_min < 10 '),
    ('Normal trips', ' duration_min >= 10 and duration_min < 100 '),
    ('Long trips', ' duration_min >= 100 and duration_min < 1000 '),
    ('Very long trips', ' duration_min >= 1000 '),
]
for band_label, band_filter in duration_bands:
    print(band_label.ljust(15, ' '), ':', bikeshare.query(band_filter).shape[0])
# Duration category names, each paired with the filter that selects its trips.
category_names = ['Small', 'Normal', 'Long', 'Very Long']
category_filters = [' duration_min > 0 and duration_min < 10 ',
                    ' duration_min >= 10 and duration_min < 100 ',
                    ' duration_min >= 100 and duration_min < 1000 ',
                    ' duration_min >= 1000 ']
trips_per_category = [bikeshare.query(category_filter).shape[0] for category_filter in category_filters]
durations = {'trip_type' : pd.Series(category_names),
             'trip_count' : pd.Series(trips_per_category)}
# One row per duration category.
trip_durations = pd.DataFrame(durations)
trip_durations
Bar chart:
# Horizontal bar chart: number of trips per duration category.
plt.figure(figsize = [12, 4])
sb.set_style("white")
base_color = 'cadetblue'
# plot pre-calculations
duration_order = ['Very Long', 'Long', 'Normal', 'Small']
# interval (in minutes) behind each category, in the same order
time_order = ['[1000, )', '[100, 1000)', '[10, 100)', '(0 , 10)']
trip_counts = trip_durations.trip_count
trip_order = trip_durations.trip_type
# x ticks every 50,000 trips, labelled in thousands
x_tick_values = np.arange(0, trip_counts.max() + 50000, 50000)
x_tick_names = ['{:0.0f} K'.format(v/1000) for v in x_tick_values]
# exactly one tick per category, so positions and labels have equal length
y_tick_values = np.arange(len(duration_order))
y_tick_names = duration_order
# first bar (top category in duration_order) is highlighted
clrs = ['indianred', 'cadetblue', 'cadetblue', 'cadetblue']
# bar plot
sb.barplot(x = trip_counts, y = trip_order, order = duration_order, palette=clrs, alpha= 1, saturation = 1)
# plot - visual enhancements
plt.title('Distribution of Trip Durations', weight = 'bold', fontsize = 16)
plt.xticks(x_tick_values, x_tick_names, fontsize = 12)
plt.yticks(y_tick_values, y_tick_names, fontsize = 12)
plt.xlabel('\nNumber of trips (thousands)', fontsize = 14)
plt.ylabel('Duration type (minutes)\n', fontsize = 14)
# Create a custom legend:
# -------------------------------------------------------
# Plot empty scatters so that each category/interval pair gets a legend entry;
# the per-label padding widths are hand-tuned to align the ' - ' separators.
indents = [10, 13, 11, 13]
for duration, time, indent in zip(duration_order, time_order, indents):
    plt.scatter([], [], c='k', alpha=0.3,
                label= '{}'.format(duration).ljust(indent, ' ') + ' - ' + '{}'.format(time))
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=True, ncol = 1, framealpha = 1,
           borderpad=1, borderaxespad=1, bbox_to_anchor = (1, 0.5), loc = 6, labelspacing=0.5,
           title='Duration - minutes', title_fontsize=14, fontsize=12, facecolor='white',
           markerfirst=True, handlelength=0.5, handletextpad=0.5)
# -------------------------------------------------------
sb.despine()
# savefig with bbox_inches='tight' adjusts the figure to include all of the x and y labels
plt.savefig('plots/3.1.18.d Categorical distribution of Trip Durations.png', dpi=300, bbox_inches='tight')
Dataset limited under 30 min and 120 min:
# Summary statistics of the trip durations, used to pick the cut-off values.
bikeshare['duration_min'].describe()
The calculations are influenced by the presence of outliers.
30 minutes, which covers 75% of the duration distribution; 120 minutes for long trip analysis.
# Assign figure and color palette as per requirement
plt.figure(figsize=[18, 5])
sb.set_style('white')
base_color = 'cadetblue'
# Subsets of the trips capped at 120 and at 30 minutes.
duration_lim_120 = bikeshare.query(' duration_min <= 120 ')
duration_lim_30 = bikeshare.query(' duration_min <= 30 ')
# One entry per panel, left to right:
# (trips, title, bin width, padding added to the max, y label, y tick step)
panel_specs = [
    (bikeshare, 'All trips\n', 100, 100, 'Rentals (thousands)\n', 100000),
    (duration_lim_120, 'Trips under 120 minutes\n', 10, 20, '', 100000),
    (duration_lim_30, 'Trips under 30 minutes\n', 1, 2, '', 10000),
]
for position, (trips, panel_title, bin_width, bin_pad, y_label, y_step) in enumerate(panel_specs, start=1):
    plt.subplot(1, 3, position)
    bin_edges = np.arange(0, trips.duration_min.max()+bin_pad, bin_width)
    plt.hist(trips['duration_min'], color=base_color, bins=bin_edges)
    # titles and labels
    plt.title(panel_title, weight='bold', fontsize=16, color='dimgrey')
    plt.xlabel('\nDuration (minutes)', fontsize=14)
    plt.ylabel(y_label, fontsize=14)
    plt.xticks(fontsize=12)
    # relabel the y ticks in thousands, at the panel's own step
    locs, labels = plt.yticks()
    y_tick_locs = np.arange(0, int(math.ceil(max(locs)))+1000, y_step)
    y_tick_names = ['{:0.0f} K'.format(tick/1000) for tick in y_tick_locs]
    plt.yticks(y_tick_locs, y_tick_names, fontsize=12)
    sb.despine()
# Spacing between the panels and room for the figure-level title.
plt.subplots_adjust(wspace=0.2, hspace=0.3)
plt.subplots_adjust(top=0.75)
plt.suptitle('Distribution of bike rental durations over datasets\n', fontsize=18, weight='bold')
# bbox_inches='tight' sizes the saved image so that the axis labels are not cut off.
plt.savefig('plots/3.1.18.e Distribution of bike rental durations over datasets.png', dpi=300, bbox_inches='tight')
# Mean trip duration, rounded up to a whole minute, for all trips and for
# the trips capped at 120 and at 30 minutes.
all_minutes = bikeshare.duration_min
overall_mean = math.ceil(all_minutes.mean())
duration_lim_120_mean = math.ceil(all_minutes[all_minutes <= 120].mean())
duration_lim_30_mean = math.ceil(all_minutes[all_minutes <= 30].mean())
mean_report = [('overall_mean', overall_mean),
               ('duration_lim_120_mean', duration_lim_120_mean),
               ('duration_lim_30_mean', duration_lim_30_mean)]
for mean_label, mean_value in mean_report:
    print(mean_label.ljust(25, ' '), ':', mean_value, 'minutes')
# Most frequent trip duration, rounded up to a whole minute, for all trips and
# for the trips capped at 120 and at 30 minutes.
# Series.mode() always returns a Series (with several entries when values tie),
# so pick its first entry explicitly instead of passing the Series to math.ceil.
overall_mode = math.ceil(bikeshare.duration_min.mode().iloc[0])
duration_lim_120_mode = math.ceil(bikeshare.query(' duration_min <= 120 ').duration_min.mode().iloc[0])
duration_lim_30_mode = math.ceil(bikeshare.query(' duration_min <= 30 ').duration_min.mode().iloc[0])
print('overall_mode'.ljust(25, ' '), ':', overall_mode, 'minutes')
print('duration_lim_120_mode'.ljust(25, ' '), ':', duration_lim_120_mode, 'minutes')
print('duration_lim_30_mode'.ljust(25, ' '), ':', duration_lim_30_mode, 'minutes')
Tabular data of the average trip durations and most frequent trip durations based on the dataset limitation of durations:
| Dataset used to measure - | Avg trip duration (min) | Most frequent trip duration (min) |
|---|---|---|
| trips under 30 minutes | 12 | 6 |
| trips under 120 minutes | 18 | 6 |
| overall trips | 30 | 6 |
# Summary table of the average and the most frequent trip duration per
# dataset cut-off (values entered by hand).
duration_avg = pd.DataFrame({
    'dataset_duration': ['< 30', '< 120', 'overall'],
    'avg_trip_duration': [12, 18, 30],
    'freq_trip_duration': [6, 6, 6],
})
duration_avg
# Two-panel figure of the most frequent trip duration per dataset cut-off:
# a point plot on the left and a bar chart on the right, sharing y limits.
plt.figure(figsize = [12, 5])
# left plot: point plot
# =====================================================
plt.subplot(1, 2, 1)
sb.set_style('white')
sb.set_palette(palette = "GnBu", n_colors = 3, desat = None)
# seaborn's point plot
ax1 = sb.pointplot(data = duration_avg, x = 'dataset_duration', y = 'freq_trip_duration', color=sb.color_palette()[2])
# improve plot aesthetics
plt.title('Most freq Trip duration - Pointplot\n', weight = 'bold', fontsize = 14, color = 'dimgrey')
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xlabel('\nTrip durations (minutes)', fontsize = 14)
plt.xticks(fontsize = 12)
# convert the yticks into integer values
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+1, 1)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
# add annotations
# -------------------------------------------------------
locs, labels = plt.xticks()
duration_freq_counts = duration_avg.freq_trip_duration.values
duration_freq_max = duration_freq_counts.max()
# one palette colour per point, used as the annotation box fill
clrs = [sb.color_palette()[0], sb.color_palette()[1], sb.color_palette()[2]]
# label every point with its value, slightly above the marker
for loc, count, clr in zip(locs, duration_freq_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    plt.text(loc, count + (duration_freq_max/20), pct_string, ha = 'center', color = 'black', fontsize = 12,
             bbox={'pad':1.9,'alpha':0.4,'color':'none','fc':clr})
# -------------------------------------------------------
sb.despine(top=True, right=True, bottom=False, left=False)
# =====================================================
# right plot: Bar chart
# =====================================================
plt.subplot(1, 2, 2)
sb.set_style('white')
sb.set_palette(palette = "GnBu", n_colors = 3, desat = None)
# seaborn's bar plot
g = sb.barplot(data = duration_avg, x = 'dataset_duration', y = 'freq_trip_duration')
# improve plot aesthetics
plt.title('Most freq Trip duration - Barchart\n', weight = 'bold', fontsize = 14, color = 'dimgrey')
plt.ylabel('', fontsize = 14)
plt.xlabel('\nTrip durations (minutes)', fontsize = 14)
plt.xticks(fontsize = 12)
plt.ylim(ax1.get_ylim())  # set y-axis limits to be same as left plot
# convert the yticks into integer values
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+1, 1)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
# add annotations
# -------------------------------------------------------
locs, labels = plt.xticks()
duration_freq_counts = duration_avg.freq_trip_duration.values
duration_freq_max = duration_freq_counts.max()
# colour tag per bar (not applied to the plain black annotations below)
clrs = ['gold' if (value > ((duration_freq_max*4)/5)) else 'limegreen' for value in duration_freq_counts]
# label every bar with its value, slightly above the bar top
for loc, count in zip(locs, duration_freq_counts):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    plt.text(loc, count + (duration_freq_max/20), pct_string, ha = 'center', color = 'black', fontsize = 12)
# -------------------------------------------------------
sb.despine(top=True, right=True, bottom=False, left=False)
# =====================================================
plt.subplots_adjust(wspace=0.2, hspace=0.3)
plt.subplots_adjust(top=0.75)
plt.suptitle('Most frequent rental duration based on dataset trip durations\n', fontsize = 16, weight = 'bold')
# savefig with bbox_inches='tight' adjusts the figure to include all of the x and y labels
plt.savefig('plots/3.1.18.f Most frequent trip duration depending on datasets.png', dpi=300, bbox_inches='tight')
# Two-panel figure of the average trip duration per dataset cut-off:
# a point plot on the left and a bar chart on the right, sharing y limits.
plt.figure(figsize = [12, 5])
# left plot: point plot
# =====================================================
plt.subplot(1, 2, 1)
sb.set_style('white')
sb.set_palette(palette = "GnBu", n_colors = 3, desat = None)
# Seaborn's point chart
ax1 = sb.pointplot(data = duration_avg, x = 'dataset_duration', y = 'avg_trip_duration', color=sb.color_palette()[2])
# improve plot aesthetics
plt.title('Avg. Trip duration - Pointplot\n', weight = 'bold', fontsize = 14, color = 'dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize = 14)
plt.xlabel('\nTrip durations (minutes)', fontsize = 14)
plt.xticks(fontsize = 12)
# convert the yticks into integer values, in steps of ten
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
# add annotations
# -------------------------------------------------------
locs, labels = plt.xticks()
duration_avg_counts = duration_avg.avg_trip_duration.values
duration_avg_max = duration_avg_counts.max()
# one palette colour per point, used as the annotation box fill
clrs = [sb.color_palette()[0], sb.color_palette()[1], sb.color_palette()[2]]
# label every point with its value, up and to the left of the marker
for loc, count, clr in zip(locs, duration_avg_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    plt.text(loc-0.2, count + int(duration_avg_max/10), pct_string, ha = 'center', color = 'black', fontsize = 12,
             bbox={'pad':1.9,'alpha':0.3,'color':'none','fc':clr})
# -------------------------------------------------------
sb.despine(top=True, right=True, bottom=False, left=False)
# =====================================================
# right plot: Bar chart
# =====================================================
plt.subplot(1, 2, 2)
sb.set_style('white')
sb.set_palette(palette = "GnBu", n_colors = 3, desat = None)
# Seaborn's bar chart
g = sb.barplot(data = duration_avg, x = 'dataset_duration', y = 'avg_trip_duration')
# improve plot aesthetics
plt.title('Avg. Trip duration - Barchart\n', weight = 'bold', fontsize = 14, color = 'dimgrey')
plt.ylabel('', fontsize = 14)
plt.xlabel('\nTrip durations (minutes)', fontsize = 14)
plt.xticks(fontsize = 12)
plt.ylim(ax1.get_ylim())  # set y-axis limits to be same as left plot
# convert the yticks into integer values, in steps of ten
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
# add annotations
# -------------------------------------------------------
locs, labels = plt.xticks()
duration_avg_counts = duration_avg.avg_trip_duration.values
duration_avg_max = duration_avg_counts.max()
# colour tag per bar (not applied to the plain black annotations below)
clrs = ['gold' if (value > ((duration_avg_max*4)/5)) else 'limegreen' for value in duration_avg_counts]
# label every bar with its value, slightly above the bar top
for loc, count in zip(locs, duration_avg_counts):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    plt.text(loc, count + int(duration_avg_max/30), pct_string, ha = 'center', color = 'black', fontsize = 12)
# -------------------------------------------------------
sb.despine(top=True, right=True, bottom=False, left=False)
# =====================================================
plt.subplots_adjust(wspace=0.2, hspace=0.3)
plt.subplots_adjust(top=0.75)
plt.suptitle('Average rental duration based on dataset trip durations\n', fontsize = 16, weight = 'bold')
# savefig with bbox_inches='tight' adjusts the figure to include all of the x and y labels
plt.savefig('plots/3.1.18.g Average trip duration depending on datasets.png', dpi=300, bbox_inches='tight')
Column: distance_miles. Data type: Quantitative data, Continuous. Plot: Histogram. Plot the distribution of distance_miles:
sb.set_style('white')
# Half-mile-wide bins covering the whole range of trip distances.
max_value = bikeshare['distance_miles'].max()
bin_edges = np.arange(0, max_value+0.5, 0.5)
hist_style = {'alpha' : 1}
sb.distplot(bikeshare['distance_miles'], kde=False, bins=bin_edges, hist_kws=hist_style, color='dimgrey')
# Title and axis labels.
plt.title('Distribution of trip distances', fontsize=14, weight='bold')
plt.xlabel('Distance (miles)', fontsize=12)
plt.ylabel('Rentals (thousands)', fontsize=12)
# Relabel the y ticks in thousands.
y_locs, y_labels = plt.yticks()
y_tick_names = ['{:0.0f} K'.format(thousands) for thousands in (y_loc/1000 for y_loc in y_locs)]
plt.yticks(y_locs, y_tick_names)
# bbox_inches='tight' sizes the saved image so that the axis labels are not cut off.
plt.savefig('plots/3.1.19.a Distribution of trip distances.png', dpi=300, bbox_inches='tight')
This data, in its natural units, looks
highly right skewed: lots of points with low values, with a very long tail of data points with large values (also, all values are positive). The most common example of this is data that follows an approximately log-normal distribution. Such data can look highly skewed in its natural units; however, after applying a logarithmic transform, it will follow a normal distribution. Hence let us apply a logarithmic transformation to the distance_miles data.
Logarithmic transformation of trip distances:
Before applying the
logarithmic transformation to the distance_miles data, separate the data from the entries with 0 values.
# Keep only trips whose distance is not 0 (log10 of 0 is undefined).
has_distance = bikeshare['distance_miles'] != 0
distance_non_zero = bikeshare.loc[has_distance, ['distance_miles']].copy()
distance_non_zero.head()
# logarithmic transformation of 'distance_miles' data
def log_trans(x, inverse=False):
    """Apply log10 to x, or undo it.

    With the default ``inverse=False`` the result is ``np.log10(x)``;
    with a truthy ``inverse`` the result is ``10 ** x``.
    """
    if inverse:
        return 10 ** x
    return np.log10(x)
# Histogram of the non-zero trip distances on a log10 axis.
# prepare the data for the plot
min_value = log_trans(distance_non_zero['distance_miles'].min())
max_value = log_trans(distance_non_zero['distance_miles'].max())
# Distances below one mile have a negative log10, so the bins must start at
# the decade containing the shortest trip; starting at 0 would leave every
# sub-mile trip outside the bins and silently drop it from the histogram.
lowest_decade = np.floor(min_value)
bin_edges = np.arange(lowest_decade, max_value+0.1, 0.1)
# matplotlib's histogram
plt.hist(distance_non_zero['distance_miles'].apply(log_trans), bins = bin_edges, color = 'dimgrey')
# improve plot aesthetics
# one tick per power of ten, labelled in miles; '{:g}' keeps fractional
# labels such as 0.1 that an integer cast would turn into 0
tick_locs = np.arange(lowest_decade, max_value+1, 1)
tick_labels = ['{:g}'.format(v) for v in log_trans(tick_locs, inverse = True)]
plt.xticks(tick_locs, tick_labels)
plt.title('Logarithmic distribution of trip distance', fontsize = 14, weight = 'bold')
plt.xlabel('Distance (miles)', fontsize = 12)
plt.ylabel('Rentals (thousands)', fontsize = 12)
# convert yticks into the units of thousands
y_locs, y_labels = plt.yticks()
y_tick_names = ['{:0.0f} K'.format(y_loc/1000) for y_loc in y_locs]
plt.yticks(y_locs, y_tick_names)
# savefig with bbox_inches='tight' adjusts the figure to include all of the x and y labels
plt.savefig('plots/3.1.19.b Logarithmic distribution of trip distances.png', dpi=300, bbox_inches='tight')
Divide the distance_miles based on their distribution, for clear graphical plot.
# Summary statistics (count, mean, std, quartiles, extremes) of the trip distances.
bikeshare.loc[:, 'distance_miles'].describe()
As the distance (displacement) is dependent on the start_station co-ordinates and end_station co-ordinates, the entries with
round trips will have 0 miles extracted as distance_miles.
# Report how many trips fall into each distance category.
print('Total trips'.ljust(20, ' '), ':', bikeshare.shape[0], '\n')
distance_bands = [
    ('Round trips', ' distance_miles == 0 '),
    ('Very small trips', ' distance_miles > 0 and distance_miles < 0.1 '),
    ('Small trips', ' distance_miles >= 0.1 and distance_miles < 0.5 '),
    ('Normal trips', ' distance_miles >= 0.5 and distance_miles < 1 '),
    ('Long trips', ' distance_miles >= 1 and distance_miles < 10 '),
    ('Very long trips', ' distance_miles >= 10 '),
]
for band_label, band_filter in distance_bands:
    print(band_label.ljust(20, ' '), ':', bikeshare.query(band_filter).shape[0])
# Distance category names, each paired with the filter that selects its trips.
category_names = ['Round Trip', 'Very Small', 'Small', 'Normal', 'Long', 'Very Long']
category_filters = [' distance_miles == 0 ',
                    ' distance_miles > 0 and distance_miles < 0.1 ',
                    ' distance_miles >= 0.1 and distance_miles < 0.5 ',
                    ' distance_miles >= 0.5 and distance_miles < 1 ',
                    ' distance_miles >= 1 and distance_miles < 10 ',
                    ' distance_miles >= 10 ']
trips_per_category = [bikeshare.query(category_filter).shape[0] for category_filter in category_filters]
distances = {'trip_type' : pd.Series(category_names),
             'trip_count' : pd.Series(trips_per_category)}
# One row per distance category.
trip_distances = pd.DataFrame(distances)
trip_distances
Bar chart:
# Horizontal bar chart: number of trips per distance category.
plt.figure(figsize = [32, 8])
sb.set_style("white")
# plot pre-calculations
base_color = sb.color_palette()[0]
dist_order = ['Very Long', 'Long', 'Normal', 'Small', 'Very Small', 'Round Trip']
# interval (in miles) behind each category, in the same order
# (the variable keeps the name used by the duration chart)
time_order = ['[10, )', '[1, 10)', '[0.5, 1)', '[0.1, 0.5)', '(0, 0.1)', '[0]']
trip_counts = trip_distances.trip_count
trip_order = trip_distances.trip_type
# x ticks every 50,000 trips, labelled in thousands
x_tick_values = np.arange(0, trip_counts.max() + 50000, 50000)
x_tick_names = ['{:0.0f} K'.format(v/1000) for v in x_tick_values]
y_tick_values = [0, 1, 2, 3, 4, 5]
y_tick_names = dist_order
# first bar (top category in dist_order) is highlighted
clrs = ['indianred', 'dimgrey', 'dimgrey', 'dimgrey', 'dimgrey', 'dimgrey']
# bar plot
sb.barplot(x = trip_counts, y = trip_order, order = dist_order, palette=clrs, alpha= 1, saturation = 1)
# plot - visual enhancements
plt.title('Categorical distribution of Trip distances\n', weight = 'bold', fontsize = 30)
plt.xticks(x_tick_values, x_tick_names, fontsize = 22)
plt.yticks(y_tick_values, y_tick_names, fontsize = 22)
plt.xlabel('\nNumber of trips (thousands)', fontsize = 26)
plt.ylabel('Distance type (miles)\n', fontsize = 26)
# Create a legend:
# -------------------------------------------------------
# per-label padding widths, hand-tuned to align the ' - ' separators
indents = [10, 13, 12, 14, 11, 11]
# Plot empty scatters so that each category/interval pair gets a legend entry
for dist, interval, indent in zip(dist_order, time_order, indents):
    plt.scatter([], [], c='k', alpha=0.3,
                label= '{}'.format(dist).ljust(indent, ' ') + ' - ' + '{}'.format(interval))
# the legend lists distance intervals in miles, so its title says so
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=True, ncol = 1, framealpha = 1,
           borderpad=1, borderaxespad=1, bbox_to_anchor = (1, 0.5), loc = 6, labelspacing=0.5,
           title='Distance - miles', title_fontsize=24, fontsize=22, facecolor='white',
           markerfirst=True, handlelength=0.5, handletextpad=0.5)
# -------------------------------------------------------
sb.despine()
# savefig with bbox_inches='tight' adjusts the figure to include all of the x and y labels
plt.savefig('plots/3.1.19.c Categorical distribution of Trip distances.png', dpi=300, bbox_inches='tight')
Column: fare. Data type: Quantitative data, Continuous. Plot: Histogram.
# compute the descriptive statistics of fare
bikeshare['fare'].describe()
Explore the optimal value of the bin size that best portrays the distribution of trip fares.
def dist_subplot(subplot, column, binsize):
    """Draw one panel of a 1x3 histogram grid for a ``bikeshare`` column.

    Reads the module-level ``bikeshare`` DataFrame and ``base_color`` and
    draws on the current matplotlib figure.

    Parameters
    ----------
    subplot : int
        1-based position of the panel inside the 1x3 grid. Only panel 1
        gets a y-axis label and y tick labels.
    column : str
        Name of the ``bikeshare`` column to plot.
    binsize : int or float
        Width of each histogram bin, in the units of ``column``.
    """
    # subplot with custom bin size
    plt.subplot(1, 3, subplot)
    # y ticks every 200,000 rows up to the size of the dataset, labelled in millions
    y_tick_values = np.arange(0, bikeshare.shape[0]+100, 200000)
    y_tick_names = ['{:0.1f} M'.format(v/1000000) for v in y_tick_values]
    bin_edges = np.arange(0, bikeshare[column].max()+binsize, binsize)
    # Seaborn's distribution plot
    sb.distplot(bikeshare[column], bins = bin_edges, kde = False,
                hist_kws = {'alpha' : 0.8}, color = base_color)
    # improve plot aesthetics
    plt.title('Fare distribution - bin size {}'.format(binsize), fontsize = 15, weight = 'bold')
    plt.xlabel('\nFare (dollars)', fontsize = 13)
    plt.xticks(fontsize=12)
    if subplot == 1:
        plt.ylabel('Rentals (millions)\n', fontsize = 13)
        plt.yticks(y_tick_values, y_tick_names, fontsize=12)
    else:
        # the remaining panels keep the same tick positions but hide the labels
        plt.ylabel('')
        plt.yticks(y_tick_values, [])
# Muted colour-blind-safe palette; its ninth colour is used for every bar
sb.set_palette('colorblind', n_colors=10, desat = 0.6)
base_color = sb.color_palette()[8]
# compare candidate bin sizes for the distribution of 'fare', one panel per size
plt.figure(figsize = [16, 4])
for panel, width in enumerate([1, 10, 100], start = 1):
    dist_subplot(subplot = panel, column = 'fare', binsize = width)
plt.subplots_adjust(wspace=0.1, hspace=0.3);
# bbox_inches='tight' adjusts the saved figure so that all of the x and y labels are included
plt.savefig('plots/3.1.20.a Distribution of trip fares - different bin sizes.png', dpi=300, bbox_inches='tight')
The above plot shows that the bin size needs to be small, and that a closer look is required to understand the distribution of the data. Plot the distribution of trip fares, limited to
10 dollars (which covers more than 75% of the data), for closer observation.
# Plot the distribution of bike fares, within the limitation of 10 dollar for closer observation
# Assign palette as per requirement
sb.set_palette('colorblind', n_colors=10, desat = 0.8)
base_color = sb.color_palette()[8]
sb.set_style('white')
# prepare the data for the plot: one-dollar bins from 0 to 10
bin_edges = np.arange(0, 10+1, 1)
# seaborn's distribution plot
sb.distplot(bikeshare['fare'], bins = bin_edges, kde = False,
            hist_kws = {'alpha' : 0.8}, color = base_color)
# improve plot aesthetics
# -------------------------------------------------------
plt.title('Trip fare distribution - bin size 1', fontsize = 16, weight = 'bold')
plt.xlabel('\nFare (Dollars)', fontsize = 14)
plt.ylabel('Rentals (million)\n', fontsize = 14)
plt.xlim(0, 10)
# take the y ticks matplotlib chose and relabel them in units of millions
locs, labels = plt.yticks()
y_tick_locs = [int(loc) for loc in locs]
y_tick_names = ['{:0.1f} M'.format(loc/1000000) for loc in y_tick_locs]
plt.yticks(y_tick_locs, y_tick_names, fontsize=12)
x_tick_locs = np.arange(0, 10+1, 1)
x_tick_names = ['{:0.0f}'.format(loc) for loc in x_tick_locs]
plt.xticks(x_tick_locs, x_tick_names,fontsize=12);
# -------------------------------------------------------
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.1.20.b Distribution of trip fares - limited to 10 dollar.png', dpi=300, bbox_inches='tight')
This data, in its natural units, looks
highly right skewed: lots of points with low values, with a very long tail of data points with large values (also, all values are positive). The most common example of this is data that follows an approximately log-normal distribution. Such data can look highly skewed in its natural units; however, after applying a logarithmic transform, the data will follow a normal distribution. Hence let us apply a logarithmic transformation to the fare data.
Logarithmic distribution of trip fares:
Before applying the logarithmic transformation to the fare data, separate out the entries with 0 values (the logarithm of 0 is undefined).
# fares without zero entries (base fares)
# keep only the 'fare' column, as an independent copy, of the rows whose fare is not 0
fare_non_zero = bikeshare.query(' fare != 0 ')[['fare']].copy()
fare_non_zero.head()
# base-10 logarithmic transformation of 'fare' data, and its inverse
def log_trans(x, inverse = False):
    """Return ``log10(x)``, or ``10 ** x`` when ``inverse`` is true."""
    if inverse:
        return 10 ** x
    return np.log10(x)
# Assign palette as per requirement
sb.set_palette('colorblind', n_colors=10, desat = 0.8)
base_color = sb.color_palette()[8]
# prepare the data for the plot: bins of width 0.1 in log10(fare) space
max_value = log_trans(fare_non_zero['fare'].max())
bin_edges = np.arange(0, max_value+0.1, 0.1)
# matplotlib's histogram
plt.hist(fare_non_zero['fare'].apply(log_trans), bins = bin_edges, color = base_color, alpha=0.8)
# improve plot aesthetics
# -------------------------------------------------------
# one x tick per power of ten, labelled with the fare in dollars instead of its logarithm
tick_locs = np.arange(0, max_value+1, 1)
plt.xticks(tick_locs, log_trans(tick_locs, inverse = True).astype(int), fontsize = 12)
plt.yticks(fontsize = 12)
plt.title('Logarithmic distribution of trip fares\n', fontsize = 16, weight = 'bold')
plt.xlabel('\nFare (Dollars)', fontsize = 14)
plt.ylabel('Rentals (thousands)\n', fontsize = 14)
# take the y ticks matplotlib chose and relabel them in units of thousands
locs, labels = plt.yticks()
y_tick_locs = [int(loc) for loc in locs]
y_tick_names = ['{:0.0f} K'.format(loc/1000) for loc in y_tick_locs]
plt.yticks(y_tick_locs, y_tick_names, fontsize=12);
# -------------------------------------------------------
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.1.20.c Logarithmic distribution of trip fares.png', dpi=300, bbox_inches='tight')
Divide the fare based on their distribution, for clear graphical plot.
# compute the descriptive statistics of trip fares
# (the minimum, quartiles and maximum guide the choice of the custom section edges below)
bikeshare['fare'].describe()
Break down the trip fares into customized sections.
# divide the fare into customized sections
# (the edges are not called `bin`, so the Python builtin of that name stays usable)
fare_bin_edges = [-1,0,5,10,50,100,600]
# pd.cut assigns each fare to its right-closed interval, e.g. (-1, 0] holds the zero fares
fare = pd.cut(bikeshare['fare'], fare_bin_edges)
fare = fare.to_frame()
fare.columns = ['fare_sections']
fare.sample(10)
Count plot:
# Assign palette as per requirement
sb.set_palette('colorblind', n_colors=10, desat = 0.8)
base_color = sb.color_palette()[8]
# Seaborn's count plot
sb.countplot(data = fare, x = 'fare_sections', color = base_color, alpha= 0.8, saturation = 1)
# improve plot aesthetics
# -------------------------------------------------------
plt.title('Sectional distribution of trip fares\n', fontsize = 16, weight = 'bold')
plt.xlabel('\nFare (Dollars)', fontsize = 14)
plt.ylabel('Rentals (million)\n', fontsize = 14)
# take the y ticks matplotlib chose and relabel them in units of millions
locs, labels = plt.yticks()
y_tick_locs = [int(loc) for loc in locs]
y_tick_names = ['{:0.1f} M'.format(loc/1000000) for loc in y_tick_locs]
plt.yticks(y_tick_locs, y_tick_names, fontsize=12)
# assigning xticks here will interfere with annotations
# -------------------------------------------------------
# add annotations
# -------------------------------------------------------
n_points = fare.shape[0]
fare_counts = fare.fare_sections.value_counts()
fare_max = fare_counts.max()
# Key the counts by the interval's text, which is what each x tick label shows.
# This covers every section (not just the five most frequent ones) and avoids
# parsing numbers back out of the label or rebinding the builtin `str`.
counts_by_label = {'{}'.format(section): count for section, count in fare_counts.items()}
# get the current tick locations and labels
locs, labels = plt.xticks()
# loop through each pair of locations and labels
for loc, label in zip(locs, labels):
    # a label with no matching section is annotated as an empty bar
    count = counts_by_label.get(label.get_text(), 0)
    if (100*count/n_points) < 0.1:
        pct_string = '< 0.1%'
    else:
        pct_string = '{:0.1f}%'.format(100*count/n_points)
    # print the annotation depending on the bar length:
    # above short bars, inside tall ones
    if count < (fare_max/10):
        plt.text(loc, count+(fare_max/25), pct_string, ha = 'center', color = 'black', weight = 'normal', fontsize = 12)
    else:
        plt.text(loc, count-(fare_max/10), pct_string, ha = 'center', color = 'black', weight = 'normal', fontsize = 12)
# -------------------------------------------------------
# get xticks and change the first categorical expression to just zero dollars
x_labels_new = ['[0]']
# get the current tick locations and labels
x_locs, x_labels = plt.xticks()
x_labels_new.extend(x_label.get_text() for x_label in x_labels[1:])
plt.xticks(x_locs, x_labels_new, fontsize=12)
sb.despine();
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.1.20.d Sectional distribution of trip fares.png', dpi=300, bbox_inches='tight')
In the previous plot (3.1.4 Distribution of bike rentals based on fare type), rentals with the Base fare type make up 83% of the data; however, the percentage of bike rentals that paid zero dollars is 74.5%. This is because, unlike the other pass types, the Walk-up pass type charges 1 dollar for the Base fare type. Hence the percentage of rentals charged zero dollars is lower than the percentage of rentals with the Base fare type.
# Seaborn's pair plot
# grid of pairwise plots for the three quantitative trip columns
g = sb.pairplot(bikeshare[['duration_min', 'distance_miles', 'fare']])
# lower the top of the subplot area to leave room for the figure-level title
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Bikeshare - numerical varibles Pairplot\n', fontsize = 16, weight = 'bold');
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.1.a Pairplot of numerical varibles .png', dpi=300, bbox_inches='tight')
# Display correlation matrix of the dataset
# Only numeric/boolean columns can be correlated. They are selected explicitly
# because pandas >= 2.0 no longer drops non-numeric columns inside DataFrame.corr().
# The matrix is computed once and reused for the heat map.
bikeshare_corr = bikeshare.select_dtypes(include = ['number', 'bool']).corr()
bikeshare_corr
# correlation plot
plt.figure(figsize = [8, 6])
# diverging colour map centred on 0 so positive and negative correlations are distinguishable
sb.heatmap(bikeshare_corr, annot = True, fmt = '.2f', cmap = 'vlag_r', center = 0)
plt.title('Bikeshare dataset - correlation map\n', weight = 'bold', fontsize = 14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12);
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.1.b Bikeshare dataset - correlation map.png', dpi=300, bbox_inches='tight')
start_lat and start_lon columns:
Column: start_lat, start_lon. Data type: numerical data, continuous. Plot: scatter plot.
Explore the geographical distribution of bike rentals based on latitude and longitude.
# scatter plot of every trip's start coordinates (latitude on x, longitude on y)
sb.set_palette(palette = "deep", n_colors = 20, desat = None)
# fit_reg=False draws the points only; alpha=1/5 makes overlapping points show up darker
sb.regplot(data = bikeshare, x = 'start_lat', y = 'start_lon',
fit_reg = False, scatter_kws = {'alpha' : 1/5})
plt.title('Start station geo-locations', fontsize = 14, weight = 'bold')
plt.xlabel('start_station_latitude', fontsize = 12)
plt.ylabel('start_station_longitude', fontsize = 12);
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.2.a Start station geo-locations.png', dpi=300, bbox_inches='tight')
Explore the descriptive statistics of the start_latitude and start_longitude.
# the min/max of both coordinates give the extent that the heat-map bins below have to cover
bikeshare[['start_lat', 'start_lon']].describe()
Heat Map:
Plot the geographical distribution of start stations using a heat map.
# prepare the data for the plot
# bin edges every 0.01 degrees, spanning the full range of each coordinate
x_bins = np.arange(bikeshare.start_lat.min(), bikeshare.start_lat.max()+0.01, 0.01)
y_bins = np.arange(bikeshare.start_lon.min(), bikeshare.start_lon.max()+0.01, 0.01)
# matplotlib's heatmap
# cmin = 0.5 leaves cells without any trip uncoloured
plt.hist2d(data = bikeshare, x = 'start_lat', y = 'start_lon',
cmin = 0.5, cmap = 'viridis_r', bins = [x_bins, y_bins])
# improve plot aesthetics
plt.title('Start station geo-distribution', fontsize = 14, weight = 'bold')
plt.xlabel('start station latitude', fontsize = 12)
plt.ylabel('start station longitude', fontsize = 12)
plt.colorbar();
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.2.b Start station geo-distribution using heatmap.png', dpi=300, bbox_inches='tight')
The above plot shows that the bike rentals are clustered at specific locations. Re-plot the graph with a larger bin size and annotations for a clearer interpretation.
plt.figure(figsize = [8, 4])
h2d = plt.hist2d(data = bikeshare, x = 'start_lat', y = 'start_lon', cmin = 0.5, cmap = 'viridis_r')
plt.title('Start station geo-distribution', fontsize = 14, weight = 'bold')
plt.xlabel('start station latitude', fontsize = 12)
plt.ylabel('start station longitude', fontsize = 12)
# getting individual elements: hist2d returns (counts, x edges, y edges, image)
counts = h2d[0]
x_bins = h2d[1]
y_bins = h2d[2]
# distinct cell counts, ignoring the NaN that marks cells below cmin
counts_list = np.unique(counts[~np.isnan(counts)])
# the mean of the distinct counts is the threshold between white and black text
counts_mean = np.mean(counts_list)
# widest bin along each axis; half of it offsets a label to the centre of its cell
x_bin_size = np.diff(x_bins).max()
y_bin_size = np.diff(y_bins).max()
for (i, j), c in np.ndenumerate(counts):
    if c >= counts_mean: # increase visibility on darkest cells
        text_color = 'white'
    elif c > 0:
        text_color = 'black'
    else:
        # NaN (empty) cells fail both comparisons and stay unlabelled
        continue
    plt.text(x_bins[i] + (x_bin_size/2), y_bins[j] + (y_bin_size/2), int(c),
             ha = 'center', va = 'center', color = text_color, fontsize = 9)
plt.colorbar();
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.2.c Start station geo-distribution using heat map.png', dpi=300, bbox_inches='tight')
As end_lat and end_lon are associated with the same bike stations there is no necessity to explore the distribution of the end_stations geographical distribution.
start_station_id and end_station_id columns:
Column: start_station_id, end_station_id. Data type: numerical data, continuous. Plot: scatter plot.
Explore the bike rental traffic between start stations and end stations:
# number of trips for every (start station, end station) pair
bikeshare.groupby([bikeshare['start_station_id'],
bikeshare['end_station_id']]).size()
# one point per trip; alpha=1/10 makes frequently used station pairs appear darker
sb.regplot(data = bikeshare, x = 'start_station_id', y = 'end_station_id', fit_reg = False, scatter_kws = {'alpha' : 1/10});
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.3 Bike rental traffic between start stations and end stations.png', dpi=300, bbox_inches='tight')
It appears that some station pairs do not have any bike activity between them.
start_station_id and trip_type columns:
Column: start_station_id, trip_type. Data type: numerical data, continuous. Plot: scatter plot.
Distribution of start station rental traffic based on trip_type.
# one row per (start station, trip type) with the number of trips in a 'rentals' column
start_stations = bikeshare.groupby([bikeshare['start_station_id'],
bikeshare['trip_type']]).size().reset_index(name='rentals')
start_stations.head()
start_stations['rentals'].describe()
Calculate the max value of the count to estimate the bin size for the following plot.
# largest number of rentals recorded for a single (start station, trip type) pair
start_stations['rentals'].max()
As the max value is around
32000, let the bin size be 500.
# Assign palette as per requirement
sb.set_style('white')
flatui = ['darkturquoise', 'turquoise']
sb.set_palette(flatui, n_colors=2, desat=0.6)
# prepare data for plotting
max_value = start_stations['rentals'].max()
bin_edges = np.arange(0, max_value+500, 500)
# plot facet grid
g = sb.FacetGrid(data = start_stations, col = 'trip_type', col_wrap = 2, height = 4, aspect = 1.2, hue = 'trip_type')
g.map(plt.hist, "rentals", bins = bin_edges)
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Distribution of Start stations Rental traffic based on trip type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
# improve plot aesthetics
# FacetGrid.set_xticklabels only renames the ticks that are already on the axis,
# so pin the tick positions first and derive every label from its own position.
x_tick_locs = np.arange(0, max_value+1, 5000)
x_tick_names = ['{:0.0f}K'.format(loc/1000) for loc in x_tick_locs]
g.set(xticks = x_tick_locs)
g.set_xticklabels(x_tick_names, size=12)
g.set_yticklabels(size = 12)
g.set_xlabels('\nBike rentals (thousands)', size = 14)
g.set_ylabels('Station count\n', size = 14)
plt.subplots_adjust(wspace=0.1, hspace=0.3);
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.4.a Distribution of Start stations Rental traffic based on trip type.png', dpi=300, bbox_inches='tight')
This data, in its natural units, looks
highly right skewed: lots of points with low values, with a very long tail of data points with large values (also, all values are positive). The most common example of this is data that follows an approximately log-normal distribution. Such data can look highly skewed in its natural units; however, after applying a logarithmic transform, the data will follow a normal distribution. Hence let us apply a logarithmic transformation to the start stations' bike rental data.
def log_trans(x, inverse = False):
    """Return ``log10(x)``, or ``10 ** x`` when ``inverse`` is true."""
    if inverse:
        return 10 ** x
    return np.log10(x)
# Assign palette as per requirement
sb.set_style('white')
flatui = ['darkturquoise', 'turquoise']
sb.set_palette(flatui, desat = 0.6)
# prepare data for plotting: histogram bins of width 0.1 in log10(rentals) space
start_stations['log_count'] = start_stations['rentals'].apply(log_trans)
max_value = log_trans(start_stations['rentals'].max())
bin_edges = np.arange(0, max_value+0.1, 0.1)
# one tick per power of ten
tick_locs = np.arange(0, max_value+1, 1)
# plot facet grid
g = sb.FacetGrid(data = start_stations, col = 'trip_type', col_wrap = 2, height = 4, aspect = 1, hue = 'trip_type')
g.map(plt.hist, "log_count", bins = bin_edges)
g.fig.subplots_adjust(top=0.7)
g.fig.suptitle('Logarithmic distribution of Start stations Rental traffic\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
# improve plot aesthetics
# FacetGrid.set_xticklabels only renames the ticks that are already on the axis,
# so place the ticks at the powers of ten before labelling them with the raw counts.
x_tick_names = log_trans(tick_locs, inverse = True).astype(int)
g.set(xticks = tick_locs)
g.set_xticklabels(x_tick_names, size = 12)
g.set_yticklabels(size = 12)
g.set_xlabels('\nBike rentals', size = 14)
g.set_ylabels('Station count\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3);
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.4.b Logarithmic distribution of Start stations Rental traffic based on trip type.png', dpi=300, bbox_inches='tight')
def log_trans(x, inverse = False):
    """Return ``log10(x)``, or ``10 ** x`` when ``inverse`` is true."""
    if inverse:
        return 10 ** x
    return np.log10(x)
# Assign palette as per requirement
sb.set_style('white')
sb.set_palette('deep', desat = 0.8)
# rentals per start station for each trip type, selected once and reused below
one_way_rentals = start_stations[start_stations['trip_type'] == "One Way"]['rentals']
round_trip_rentals = start_stations[start_stations['trip_type'] == "Round Trip"]['rentals']
# both histograms share bins of width 0.1 in log10 space, sized from the One Way maximum
max_value = log_trans(one_way_rentals.max())
bin_edges = np.arange(0, max_value+0.1, 0.1)
plt.hist(one_way_rentals.apply(log_trans),
         bins = bin_edges, color = sb.color_palette()[0], alpha=0.6, label = 'One Way')
plt.hist(round_trip_rentals.apply(log_trans),
         bins = bin_edges, color = sb.color_palette()[1], alpha=0.6, label = 'Round Trip')
# one x tick per power of ten, labelled with the raw rental count
tick_locs = np.arange(0, max_value+1, 1)
plt.xticks(tick_locs, log_trans(tick_locs, inverse = True).astype(int), fontsize = 12)
plt.yticks(fontsize = 12)
plt.xlabel('\nBike rentals', fontsize = 14)
plt.ylabel('Station count\n', fontsize = 14)
plt.title('Logarithmic distribution of Start stations Rental traffic\n', fontsize = 16, weight = 'bold')
plt.legend(bbox_to_anchor=(1.4, 1), scatterpoints=1,frameon=True, fancybox=True,
           shadow=False, framealpha = 1, borderpad=1, borderaxespad=1, labelspacing=0.5,
           ncol = 1, title='Trip type', title_fontsize=14, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, loc = 'upper right');
sb.despine();
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.4.c Logarithmic distribution of Start stations Rental traffic based on trip type.png', dpi=300, bbox_inches='tight')
def log_trans(x, inverse = False):
    """Return ``log10(x)``, or ``10 ** x`` when ``inverse`` is true."""
    if inverse:
        return 10 ** x
    return np.log10(x)
# Assign palette as per requirement
sb.set_style('white')
sb.set_palette('deep', desat = 0.8)
# rentals per start station for each trip type, selected once and reused below
one_way_rentals = start_stations[start_stations['trip_type'] == "One Way"]['rentals']
round_trip_rentals = start_stations[start_stations['trip_type'] == "Round Trip"]['rentals']
max_value = log_trans(one_way_rentals.max())
bin_edges = np.arange(0, max_value+0.1, 0.1)
# hist=False: only the shaded kernel density estimate of log10(rentals) is drawn
sb.distplot(one_way_rentals.apply(log_trans),
            bins = bin_edges, kde = True, kde_kws = {'alpha' :0.8, "shade": True},
            color = sb.color_palette()[0], label = 'One Way', hist=False)
sb.distplot(round_trip_rentals.apply(log_trans),
            bins = bin_edges, kde = True, kde_kws = {'alpha' :0.8, "shade": True},
            color = sb.color_palette()[1], label = 'Round Trip', hist=False)
# one x tick per power of ten, labelled with the raw rental count
tick_locs = np.arange(0, max_value+1, 1)
plt.xticks(tick_locs, log_trans(tick_locs, inverse = True).astype(int), fontsize = 12)
plt.yticks(fontsize = 12)
plt.xlabel('\nBike rentals', fontsize = 14)
plt.ylabel('Kernal Density Estimation\n', fontsize = 14)
plt.title('Logarithmic distribution of Start stations Rental traffic\n', fontsize = 16, weight = 'bold')
plt.legend(bbox_to_anchor=(1.3, 1), scatterpoints=1,frameon=True, fancybox=True,
           shadow=False, framealpha = 1, borderpad=1, borderaxespad=1, labelspacing=0.5,
           ncol = 1, title='Trip type', title_fontsize=14, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, loc = 'upper right');
sb.despine();
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.4.d Kernal density estimation of Start stations Rental traffic based on trip type.png', dpi=300, bbox_inches='tight')
# Print, for each trip type, the number of start stations followed by the number
# of stations in every traffic band (bands are half-open: lower <= rentals < upper).
traffic_bands = [('Very Low', 'rentals < 10'),
                 ('Low', 'rentals >= 10 and rentals < 100'),
                 ('Normal', 'rentals >= 100 and rentals < 1000'),
                 ('High', 'rentals >= 1000 and rentals < 10000'),
                 ('Very High', 'rentals >= 10000')]
# the second element is the width the row labels are padded to
for trip, label_width in [('One Way', 35), ('Round Trip', 40)]:
    header = 'start stations with "{}" trips'.format(trip)
    n_stations = start_stations.query(' trip_type == "{}" '.format(trip)).shape[0]
    print(header.ljust(label_width, ' '), ':', n_stations, '\n')
    for level, band in traffic_bands:
        row_label = '{} traffic start stations'.format(level)
        in_band = start_stations.query(' trip_type == "{}" and {} '.format(trip, band))
        print(row_label.ljust(label_width, ' '), ':', in_band.shape[0])
# decade-wide edges for the rental counts
# (not called `bin`, so the Python builtin of that name stays usable)
rental_bin_edges = [0,10,100,1000,10000,100000]
# use pd.cut to attribute the values to their specific bins
# NOTE(review): pd.cut intervals are right-closed by default, e.g. (0, 10], while the
# printed summaries use `rentals < 10` / `rentals >= 10`; a station with exactly 10
# rentals therefore lands in different bands in the two places - confirm which is intended.
category = pd.cut(start_stations['rentals'], rental_bin_edges)
category = category.to_frame()
category.columns = ['rental_bins']
category['trip_type'] = start_stations['trip_type']
category['start_station_id'] = start_stations['start_station_id']
# reindex returns a new frame, so assign it back for the column order to take effect
category = category.reindex(columns=['start_station_id', 'trip_type', 'rental_bins'])
category.head()
category.rental_bins.sort_values(ascending=True).unique()
%%time
def label_race(df):
    """Add a ``traffic`` label to ``df`` in place, derived from ``rental_bins``.

    The i-th rental bin, in ascending order, is mapped to the i-th entry of
    ``['Very Low', 'Low', 'Normal', 'High', 'Very High']``. For a categorical
    column (such as the output of ``pd.cut``) the order is taken from the
    declared categories rather than from the bins that happen to occur, so
    an empty bin neither shifts the labels of the bins above it nor raises
    an ``IndexError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``rental_bins`` column. It is modified in place: a
        ``traffic`` column is created or updated. Rows whose bin is missing
        are left unlabelled.

    Returns
    -------
    None
    """
    traffic_levels = ['Very Low', 'Low', 'Normal', 'High', 'Very High']
    rental_bins = df['rental_bins']
    if isinstance(rental_bins.dtype, pd.api.types.CategoricalDtype):
        ordered_bins = rental_bins.cat.categories
    else:
        ordered_bins = rental_bins.dropna().sort_values(ascending=True).unique()
    for rental_bin, level in zip(ordered_bins, traffic_levels):
        in_bin = rental_bins == rental_bin
        # skip empty bins so the column is first created by a real assignment
        if in_bin.any():
            df.loc[in_bin, 'traffic'] = level
# df is another name for category (not a copy), so label_race adds 'traffic' to category itself
df = category
label_race(df)
# store the labels as an ordered categorical so sorting and plotting follow traffic volume
level_order = ['Very Low', 'Low', 'Normal', 'High', 'Very High']
ordered_cat = pd.api.types.CategoricalDtype(ordered = True, categories = level_order)
df['traffic'] = df['traffic'].astype(ordered_cat)
category.traffic.value_counts()
def plot_pie(cat_type):
    """Draw a donut chart of start-station counts per traffic level.

    Reads the module-level ``category`` DataFrame and draws on the current
    matplotlib axes. The wedge(s) with the smallest station count are pulled
    out of the ring and drawn in the second palette colour.

    Parameters
    ----------
    cat_type : str
        Value of ``category['trip_type']`` to plot; also used as the title.
    """
    df = category[category['trip_type'] == cat_type]
    subdf = df.groupby([df['traffic']]).size().reset_index(name='stations')
    # traffic levels without any station get no wedge
    df_to_plot = subdf[subdf['stations'] != 0]
    type_level_counts = df_to_plot.stations
    type_level_index = df_to_plot.traffic.sort_values(ascending=True).unique()
    smallest = type_level_counts.min()
    clrs = [ sb.color_palette()[0] if (x > smallest) else sb.color_palette()[1] for x in type_level_counts ]
    explode = [ 0 if (x > smallest) else 0.2 for x in type_level_counts ]
    p, tx, autotexts = plt.pie(type_level_counts, labels = type_level_index, startangle = 90,
                               counterclock = False, wedgeprops = {'width' : 0.4}, shadow=False,
                               explode=explode, colors=clrs, textprops={'fontsize': 14},
                               autopct='', labeldistance=1.1, pctdistance=0.8)
    plt.title(cat_type+ '\n\n\n', weight='bold', color='grey', fontsize=14)
    plt.axis('square');
    # autopct='' only creates the text objects; fill them with the raw station counts.
    # Pair texts and counts by position: after the zero rows are dropped the Series
    # index no longer has to run 0..n-1, so a lookup by label could fail or mismatch.
    for a, n_stations in zip(autotexts, type_level_counts):
        a.set_text("{}".format(n_stations))
# Two-colour palette: regular wedges and the highlighted smallest wedge
sb.set_style('white')
flatui = ['darkturquoise', 'paleturquoise']
sb.set_palette(flatui, desat = 0.6)
types = category.trip_type.unique()
Ncount = len(types)
plt.figure(figsize = [12, 6])
# one donut chart per trip type, side by side
for position, trip in enumerate(['One Way', 'Round Trip'], start = 1):
    plt.subplot(1, 2, position)
    plot_pie(trip)
plt.suptitle('Classification of Start station traffic based on trip type', fontsize = 16, weight = 'bold')
plt.subplots_adjust(top=0.7, wspace=0.3, hspace=0.3);
# bbox_inches='tight' adjusts the saved figure so that all of the x and y labels are included
plt.savefig('plots/3.2.4.e pie chart classification of Start stations Rental traffic based on trip type.png', dpi=300, bbox_inches='tight')
# Assign palette as per requirement
sb.set_style('white')
flatui = ['darkturquoise', 'turquoise']
sb.set_palette(flatui, desat = 0.6)
# plot Facet Grid
# (FacetGrid creates its own figure, so no plt.figure call is needed beforehand;
# one would only leave an empty figure behind)
g = sb.FacetGrid(data = category, col = 'trip_type', col_wrap = 2, height = 4, aspect = 1, hue = 'trip_type')
g.map(sb.countplot, 'traffic', order = category.traffic.sort_values(ascending=True).unique())
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Start station rentals traffic based on trip type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}\n', weight = 'bold', size = 14, color = 'grey')
# improve plot aesthetics
g.set_yticklabels(size = 10)
g.set_xticklabels(size = 10)
g.set_xlabels('\nRental traffic', size = 13)
g.set_ylabels('Start stations\n', size = 13)
g.add_legend(bbox_to_anchor=(1.05, 0.7), scatterpoints=1,frameon=True, fancybox=True,
             shadow=False, framealpha = 1, borderpad=1, borderaxespad=1, labelspacing=0.5,
             ncol = 1, title='Trip type', title_fontsize=14, fontsize=10, facecolor='white',
             markerfirst=True, handlelength=2, handletextpad=0.5);
plt.subplots_adjust(top=0.7)
plt.subplots_adjust(wspace=0.3, hspace=0.3);
# add annotations
# -------------------------------------------------------
#loop over the different figures in the grid
for ax in g.axes.ravel():
    # loop over the different bars in each figure
    for p in ax.patches:
        # write the bar's height (the station count) 10 points above the top of the bar
        ax.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points');
# -------------------------------------------------------
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.4.f facet grid classification of Start stations Rental traffic based on trip type.png', dpi=300, bbox_inches='tight')
# Assign palette as per requirement
sb.set_style('white')
sb.set_palette('GnBu', desat = 0.6)
plt.figure(figsize = [6, 4])
# plot clustered bar chart: one cluster per trip type, one bar per traffic level
g = sb.countplot(data = category, x = 'trip_type', hue = 'traffic', alpha = 0.8, saturation = 0.8)
# improve plot aesthetics
plt.title('Classification of Start station traffic based on trip type\n\n', fontsize = 16, weight = 'bold')
# the x axis carries the trip types; the traffic levels are the hue shown in the legend
plt.xlabel('\n Trip type', fontsize = 14)
plt.ylabel('Number of stations\n', fontsize = 14)
# the y ticks are hidden because every bar is annotated with its count below
plt.yticks([], fontsize = 12)
plt.xticks(fontsize = 12)
sb.despine(left=True)
# plot legend
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1, framealpha = 1,
           borderpad=1, borderaxespad=1, bbox_to_anchor = (1, 0.8), loc = 6, labelspacing=0.5,
           title='Rental traffic', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5)
# add annotations
# -------------------------------------------------------
for p in g.patches:
    # write the bar's height (the station count) 10 points above the top of the bar
    g.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()),
               ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
# -------------------------------------------------------
# dashed vertical line between the two trip-type clusters
separators = [0.5]
for loc in separators:
    plt.axvline(loc, ls='--', color='grey', linewidth=1, alpha=0.4);
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.4.g Classification of Start stations Rental traffic based on trip type.png', dpi=300, bbox_inches='tight')
# Two-colour palette, one colour per trip type
sb.set_style('white')
flatui = ['darkturquoise', 'paleturquoise']
sb.set_palette(flatui, desat = 0.6)
plt.figure(figsize = [6, 4])
# clustered bar chart: one cluster per traffic level, one bar per trip type
g = sb.countplot(data = category, x = 'traffic', hue = 'trip_type', alpha = 0.8, saturation = 0.8)
# titles and axis labels; the y ticks are hidden because each bar carries its own count
plt.title('Classification of Start station traffic based on trip type\n\n', fontsize = 16, weight = 'bold')
plt.xlabel('\n Rental traffic', fontsize = 14)
plt.ylabel('Number of stations\n', fontsize = 14)
plt.yticks([], fontsize = 12)
plt.xticks(fontsize = 12)
sb.despine(left=True)
# legend, anchored inside the upper-right part of the axes
legend_style = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
                    framealpha = 1, borderpad=1, borderaxespad=1, bbox_to_anchor = (0.8, 0.9),
                    loc = 6, labelspacing=0.5, title='Trip type', title_fontsize=12,
                    fontsize=10, facecolor='white', markerfirst=True, handlelength=2,
                    handletextpad=0.5)
plt.legend(**legend_style)
# -------------------------------------------------------
# annotate every bar with its height, centred 10 points above the top of the bar
for bar in g.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    g.annotate('{:.0f}'.format(bar_height), (bar_centre, bar_height),
               ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
# -------------------------------------------------------
# bbox_inches='tight' adjusts the saved figure so that all of the x and y labels are included
plt.savefig('plots/3.2.4.h Classification of Start stations Rental traffic based on trip type.png', dpi=300, bbox_inches='tight')
start_station_id and bike_type columns:
Column: start_station_id, bike_type. Data type: numerical data, continuous. Plot: scatter plot.
Distribution of start station rental traffic based on bike_type.
# one row per (start station, bike type) with the number of trips in a 'rentals' column
# (this replaces the per-trip-type start_stations table built earlier)
start_stations = bikeshare.groupby([bikeshare['start_station_id'],
bikeshare['bike_type']]).size().reset_index(name='rentals')
start_stations.head(10)
Calculate the max value of the count to estimate the bin size for the following plot.
# largest number of rentals recorded for a single (start station, bike type) pair
start_stations['rentals'].max()
As the max value is around
18000, let the bin size be 500.
# Assign palette as per requirement
sb.set_style('white')
sb.set_palette('GnBu_d', n_colors=5, desat=0.6)
# prepare data for plotting
max_value = start_stations['rentals'].max()
bin_edges = np.arange(0, max_value+500, 500)
# plot facet grid
g = sb.FacetGrid(data = start_stations, col = 'bike_type', col_wrap = 2, height = 4, aspect = 1, hue = 'bike_type')
g.map(plt.hist, "rentals", bins = bin_edges)
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Distribution of Start stations Rental traffic based on bike type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
# improve plot aesthetics
# FacetGrid.set_xticklabels only renames the ticks that are already on the axis,
# so pin the tick positions first and derive every label from its own position.
x_tick_locs = np.arange(0, max_value+1, 5000)
x_tick_names = ['{:0.0f}K'.format(loc/1000) for loc in x_tick_locs]
g.set(xticks = x_tick_locs)
g.set_xticklabels(x_tick_names, size=12)
g.set_yticklabels(size = 12)
g.set_xlabels('\nBike rentals (thousands)', size = 14)
g.set_ylabels('Station count\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3);
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.5.a Distribution of Start stations Rental traffic based on bike type.png', dpi=300, bbox_inches='tight')
This data, in its natural units, looks
highly right skewed: lots of points with low values, with a very long tail of data points with large values (also, all values are positive). The most common example of this is data that follows an approximately log-normal distribution. Such data can look highly skewed in its natural units; however, after applying a logarithmic transform, the data will follow a normal distribution. Hence let us apply a logarithmic transformation to the start stations' bike rental data.
def log_trans(x, inverse = False):
    """Apply a base-10 logarithm to ``x``, or undo it.

    Parameters
    ----------
    x : scalar or array-like
        Value(s) to transform.
    inverse : bool, default False
        When False, return ``np.log10(x)``.  When True, return ``10 ** x``.

    Returns
    -------
    The transformed value(s).
    """
    if not inverse:
        return np.log10(x)
    # np.log10 accepts plain lists/tuples but ``10 ** [..]`` raises TypeError,
    # so convert them first to keep both directions accepting the same inputs.
    if isinstance(x, (list, tuple)):
        x = np.asarray(x)
    return 10 ** x
# Histogram of log10(rentals) from start_stations, one facet per bike type.

# Assign palette as per requirement
sb.set_style('white')
sb.set_palette('GnBu_d', n_colors=5, desat=0.6)

# prepare data for plotting: 0.1-wide bins on the log10 scale
start_stations['log_count'] = start_stations['rentals'].apply(log_trans)
min_value = log_trans(start_stations['rentals'].min())
max_value = log_trans(start_stations['rentals'].max())
bin_edges = np.arange(0, max_value+0.1, 0.1)
# ticks at whole powers of ten (0, 1, 2, ... on the log10 axis)
tick_locs = np.arange(0, max_value+1, 1)

# plot facet grid
g = sb.FacetGrid(data = start_stations, col = 'bike_type', col_wrap = 2, height = 4, aspect = 1, hue = 'bike_type')
g.map(plt.hist, "log_count", bins = bin_edges)
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Logarithmic distribution of Start stations Rental traffic', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')

# improve plot aesthetics
# Pin the ticks to tick_locs before naming them: set_xticklabels() relabels
# whatever ticks exist, so without this the 1/10/100/... labels would be
# attached to the automatically chosen tick positions instead.
x_tick_names = log_trans(tick_locs, inverse = True).astype(int)
g.set(xticks = tick_locs)
g.set_xticklabels(x_tick_names, size = 12)
g.set_yticklabels(size = 12)
g.set_xlabels('\nNumber of bike rentals', size = 14)
g.set_ylabels('Station count\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3);

# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.5.b Logarithmic distribution of Start stations Rental traffic based on bike type.png', dpi=300, bbox_inches='tight')
def log_trans(x, inverse = False):
    """Apply a base-10 logarithm to ``x``, or undo it.

    Parameters
    ----------
    x : scalar or array-like
        Value(s) to transform.
    inverse : bool, default False
        When False, return ``np.log10(x)``.  When True, return ``10 ** x``.

    Returns
    -------
    The transformed value(s).
    """
    if not inverse:
        return np.log10(x)
    # np.log10 accepts plain lists/tuples but ``10 ** [..]`` raises TypeError,
    # so convert them first to keep both directions accepting the same inputs.
    if isinstance(x, (list, tuple)):
        x = np.asarray(x)
    return 10 ** x
# Overlaid density curves of log10(rentals), one curve per bike type.

# Assign palette as per requirement
sb.set_style('white')
sb.set_palette('deep', n_colors = 4, desat = 0.8)

# log10 of the largest rental count drives both the bin edges and the tick range
max_value = log_trans(start_stations['rentals'].max())
bin_edges = np.arange(0, max_value+0.1, 0.1)

# draw the curves in a fixed order so each bike type takes the palette colour at its position
for palette_idx, bike in enumerate(["unknown", "Standard", "Electric", "Smart"]):
    log_rentals = start_stations[start_stations['bike_type'] == bike]['rentals'].apply(log_trans)
    sb.distplot(log_rentals, bins = bin_edges, kde = True, kde_kws = {'alpha' :0.8},
                color = sb.color_palette()[palette_idx], label = bike, hist=False)

# ticks sit at whole powers of ten but are labelled with the untransformed counts
tick_locs = np.arange(0, max_value+1, 1)
tick_names = log_trans(tick_locs, inverse = True).astype(int)
plt.xticks(tick_locs, tick_names, fontsize = 12)
plt.yticks(fontsize = 12)
plt.xlabel('\nNumber of bike rentals', fontsize = 14)
plt.ylabel('Kernal Density Estimation\n', fontsize = 14)
plt.title('Kernal density estimation of Start stations Rental traffic\n', fontsize = 16, weight = 'bold')

# legend
plt.legend(title='Bike type', title_fontsize=12, fontsize=10, loc = 'upper left', ncol = 1,
           frameon=True, fancybox=True, shadow=False, framealpha = 1, facecolor='white',
           borderpad=1, borderaxespad=1, labelspacing=0.5, scatterpoints=1,
           markerfirst=True, handlelength=2, handletextpad=0.5);
sb.despine();

# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.5.c Kernal density estimation of Start stations Rental traffic based on bike type.png', dpi=300, bbox_inches='tight')
# Bucket each row's rental count into power-of-ten intervals with pd.cut and
# keep it next to the station id and bike type it belongs to.
bin = [0, 10, 100, 1000, 10000, 100000]
category = pd.DataFrame({
    'start_station_id': start_stations['start_station_id'],
    'bike_type': start_stations['bike_type'],
    'rental_bins': pd.cut(start_stations['rentals'], bin),
})
category.head()
# the distinct intervals that actually occur, in ascending order
category['rental_bins'].sort_values(ascending=True).unique()
%%time
def label_race(df):
    """Add a 'traffic' column to ``df`` naming the bin held in 'rental_bins'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a categorical 'rental_bins' column (as produced by
        ``pd.cut``).  Modified in place; nothing is returned.

    The bins are matched to labels by their position among the column's
    categories, lowest first: 'Very Low', 'Low', 'Normal', 'High',
    'Very High'.  Rows whose 'rental_bins' is missing get a missing label.
    """
    tiers = ('Very Low', 'Low', 'Normal', 'High', 'Very High')
    # Pair labels with the full list of categories rather than with the bins
    # that happen to occur in the data: if a bin has no rows, indexing the
    # observed values would shift every later label and eventually run off
    # the end of the list.
    all_bins = df['rental_bins'].cat.categories
    bin_to_tier = dict(zip(all_bins, tiers))
    df['traffic'] = df['rental_bins'].cat.rename_categories(bin_to_tier).astype(object)
# `df` is a second name for `category` (not a copy), so labelling it labels `category` too
df = category
label_race(df)

# turn the text labels into an ordered categorical running from 'Very Low' to 'Very High'
level_order = ['Very Low', 'Low', 'Normal', 'High', 'Very High']
ordered_cat = pd.CategoricalDtype(categories = level_order, ordered = True)
df['traffic'] = df['traffic'].astype(ordered_cat)

# number of rows per traffic label
category['traffic'].value_counts()
def plot_pie(cat_type):
    """Draw a donut chart of traffic labels for one bike type on the current axes.

    Parameters
    ----------
    cat_type : str
        Value of ``category['bike_type']`` to select; also used as the title.

    Reads the module-level ``category`` frame.  Traffic labels with no rows
    are left out, and the first remaining wedge is pulled out and drawn in a
    different palette colour from the others.
    """
    selected = category[category['bike_type'] == cat_type]
    per_label = selected.groupby([selected['traffic']]).size().reset_index(name='stations')
    # keep only the traffic labels that actually occur for this bike type
    present = per_label.loc[per_label['stations'] != 0]
    wedge_sizes = present.stations
    wedge_names = present.traffic.sort_values(ascending=True).unique()
    n_wedges = len(wedge_names)

    # first wedge: palette colour 3 and exploded; every other wedge: palette colour 0
    highlight = sb.color_palette()[3]
    regular = sb.color_palette()[0]
    wedge_colours = [highlight, regular, regular, regular, regular][0:n_wedges]
    wedge_offsets = [0.5, 0, 0, 0, 0][0:n_wedges]

    plt.pie(wedge_sizes, labels = wedge_names, colors = wedge_colours, explode = wedge_offsets,
            startangle = 90, counterclock = False, shadow = False,
            wedgeprops = {'width' : 0.4}, textprops = {'fontsize': 12},
            autopct = '%1.0f%%', labeldistance = 1.1, pctdistance = 0.8)
    plt.title(cat_type+ '\n\n', weight='bold', color='grey', fontsize=14)
    plt.axis('square');
# One donut chart per bike type, side by side in a single row.

# Assign palette as per requirement
sb.set_style('white')
flatui = ['darkturquoise', 'mediumturquoise', 'turquoise', 'paleturquoise']
sb.set_palette(flatui, n_colors=4, desat=0.6)

types = category.bike_type.unique()
Ncount = len(category.bike_type.unique())

plt.figure(figsize = [18, 6])
# subplots are numbered from 1, left to right
for slot, bike in enumerate(('unknown', 'Standard', 'Electric', 'Smart'), start = 1):
    plt.subplot(1, 4, slot)
    plot_pie(bike)

plt.suptitle('Classification of Start station traffic based on bike type', fontsize = 16, weight = 'bold')
plt.subplots_adjust(top=0.8)
plt.subplots_adjust(wspace=0.1, hspace=0.2);

# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.5.d Pie chart classification of Start stations Rental traffic based on bike type.png', dpi=300, bbox_inches='tight')
# Bar chart of traffic-label counts from `category`, one facet per bike type.

# Assign palette as per requirement
sb.set_style('white')
sb.set_palette('GnBu_d', n_colors=5, desat=0.6)

# plot Facet Grid
g = sb.FacetGrid(data = category, col = 'bike_type', col_wrap = 2, height = 4, aspect = 1, hue = 'bike_type')
g.map(sb.countplot, 'traffic', order = category.traffic.sort_values(ascending=True).unique())
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Start station rentals traffic based on bike type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 16, color = 'grey')

# improve plot aesthetics
# Each facet is relabelled from its own tick positions.  Gathering the label
# text of every facet into one list and applying that list to each facet hands
# an axis several times more labels than it has ticks, and parsing label text
# breaks when the text has not been rendered yet.
for ax in g.axes.flat:
    y_ticks = ax.get_yticks()
    ax.set_yticks(y_ticks)
    ax.set_yticklabels(['{:0.0f}'.format(tick) for tick in y_ticks], size = 12)
    # the x tick text is left as countplot wrote it; only the size changes
    ax.tick_params(axis = 'x', labelsize = 12)
g.set_xlabels('\nRental traffic', size = 14)
g.set_ylabels('Station count\n', size = 14)

# add annotations
for ax in g.axes.ravel(): # this will loop over the different figures in the grid
    for p in ax.patches: # this will loop over the different bars in each figure
        ax.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points', fontsize=10)
plt.subplots_adjust(top=0.85)
plt.subplots_adjust(wspace=0.1, hspace=0.2);

# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.5.e Facet grid classification of Start stations Rental traffic based on bike type.png', dpi=300, bbox_inches='tight')
# Clustered bar chart: bike type along the x axis, one bar per traffic label.

# Assign palette as per requirement
sb.set_style('white')
sb.set_palette('GnBu', n_colors=5, desat=0.6)
plt.figure(figsize = [12, 4])

# plot clustered bar chart
g = sb.countplot(data = df, x = 'bike_type', hue = 'traffic', alpha = 0.8, saturation = 1)

# improve plot aesthetics
plt.title('Distribution of Start station traffic over bike type\n\n', fontsize = 16, weight = 'bold')
# the x axis carries bike_type and the colours carry traffic, so the axis label
# and the legend title below are named accordingly
plt.xlabel('\nBike type', fontsize = 14)
plt.ylabel('Number of stations\n', fontsize = 14)
plt.yticks([], fontsize = 12)
plt.xticks(fontsize = 12)
sb.despine(top=True, left=True, right=True, bottom=False)

# plot legend
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1, framealpha = 1,
           borderpad=1, borderaxespad=1, bbox_to_anchor = (1, 0.9), loc = 'upper left', labelspacing=0.5,
           title='Rental traffic', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5)

# add annotations
for p in g.patches:
    g.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()),
               ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')

# dashed separators halfway between neighbouring x groups; derived from the
# number of groups so that no line is drawn past the last group
separators = np.arange(0.5, len(g.get_xticks()) - 1, 1)
for loc in separators:
    plt.axvline(loc, ls='--', color='grey', linewidth=1, alpha=0.4);

# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.5.f Distribution of Start stations Rental traffic over bike type.png', dpi=300, bbox_inches='tight')
# Clustered bar chart: traffic label along the x axis, one bar per bike type.

# Assign palette as per requirement
sb.set_style('white')
flatui = ['darkturquoise', 'mediumturquoise', 'turquoise', 'paleturquoise']
sb.set_palette('deep', n_colors=4, desat=0.6)
plt.figure(figsize = [12, 4])

# plot clustered bar chart
g = sb.countplot(x = 'traffic', hue = 'bike_type', data = category, saturation = 1, alpha = 0.8)

# improve plot aesthetics
plt.title('Classification of Start station traffic based on bike type\n\n', fontsize = 16, weight = 'bold')
plt.xlabel('\n Rental traffic', fontsize = 14)
plt.ylabel('Number of stations\n', fontsize = 14)
# no y ticks are drawn; the bar heights are written above the bars instead
plt.yticks([], fontsize = 12)
plt.xticks(fontsize = 12)
sb.despine(left=True)

# plot legend
plt.legend(title='Bike type', title_fontsize=12, fontsize=10, loc = 6, bbox_to_anchor = (1, 0.9),
           ncol = 1, frameon=True, fancybox=True, shadow=False, framealpha = 1, facecolor='white',
           borderpad=1, borderaxespad=1, labelspacing=0.5, scatterpoints=1,
           markerfirst=True, handlelength=2, handletextpad=0.5)

# add annotations: the height of each bar, centred 10 points above its top
for bar in g.patches:
    bar_top = bar.get_height()
    bar_middle = bar.get_x() + bar.get_width() / 2.
    g.annotate(format(bar_top, '.0f'), (bar_middle, bar_top),
               ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')

# dashed vertical lines at the half-way positions between x groups
separators = [0.5, 1.5, 2.5, 3.5]
for boundary in separators:
    plt.axvline(boundary, ls='--', color='grey', linewidth=1, alpha=0.4);

# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.5.g Classification of Start stations Rental traffic over bike type.png', dpi=300, bbox_inches='tight')
start_station_id and pass_type columns:
Column: start_station_id, pass_type
Data type: numerical data, continuous
Plot: Scatter plot
Distribution of start_stations rental traffic based on pass_type.
# Count the rows of bikeshare for every (start station, pass type) pair and
# store the count in a 'rentals' column.
station_pass_keys = [bikeshare['start_station_id'], bikeshare['pass_type']]
rentals_per_pair = bikeshare.groupby(station_pass_keys).size()
start_stations = rentals_per_pair.reset_index(name='rentals')
start_stations.head()
Calculate the maximum rental count to estimate a suitable bin size for the plot that follows.
# Largest value in the 'rentals' column; displayed to help choose a histogram bin size
start_stations['rentals'].max()
As the max value is around 25000, let the bin size be 500.
# Histogram of the 'rentals' column of start_stations, one facet per pass type.

# Assign palette as per requirement
sb.set_style('white')
sb.set_palette('GnBu_d', n_colors=5, desat=0.6)

# prepare data for plotting: 500-wide bins from 0 up to the largest rental count
max_value = start_stations['rentals'].max()
bin_edges = np.arange(0, max_value+500, 500)
# one x tick every 5000 rentals, extending just past the largest count
tick_locs = np.arange(0, max_value+5000, 5000)

# plot facet grid
g = sb.FacetGrid(data = start_stations, col = 'pass_type', col_wrap = 3, height = 4, aspect = 1, hue = 'pass_type')
g.map(plt.hist, "rentals", bins = bin_edges)
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Distribution of Start stations Rental traffic based on pass type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')

# improve plot aesthetics
# set_xticklabels() only renames the ticks that already exist, so the tick
# positions are pinned first and the labels are built from those same
# positions; otherwise the "NK" labels would not line up with the data.
x_tick_locs = tick_locs
x_tick_names = ['{:0.0f}K'.format(loc / 1000) for loc in x_tick_locs]
g.set(xticks = x_tick_locs)
g.set_xticklabels(x_tick_names, size=12)
g.set_yticklabels(size = 12)
g.set_xlabels('\nBike rentals', size = 14)
g.set_ylabels('Station count\n', size = 14)
plt.subplots_adjust(wspace=0.1, hspace=0.2);

# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.6.a Distribution of Start stations Rental traffic based on pass type.png', dpi=300, bbox_inches='tight')
This data, in its natural units, looks highly right-skewed: lots of points with low values, with a very long tail of data points with large values (also, all values are positive). The most common example of this is data that follows an approximately log-normal distribution. Such data can look highly skewed in its natural units; however, after applying a logarithmic transform, it follows an approximately normal distribution. Hence, let us apply a logarithmic transformation to the start stations' bike rental data.
def log_trans(x, inverse = False):
    """Apply a base-10 logarithm to ``x``, or undo it.

    Parameters
    ----------
    x : scalar or array-like
        Value(s) to transform.
    inverse : bool, default False
        When False, return ``np.log10(x)``.  When True, return ``10 ** x``.

    Returns
    -------
    The transformed value(s).
    """
    if not inverse:
        return np.log10(x)
    # np.log10 accepts plain lists/tuples but ``10 ** [..]`` raises TypeError,
    # so convert them first to keep both directions accepting the same inputs.
    if isinstance(x, (list, tuple)):
        x = np.asarray(x)
    return 10 ** x
# Histogram of log10(rentals) from start_stations, one facet per pass type.

# Assign palette as per requirement
sb.set_style('white')
sb.set_palette('GnBu_d', n_colors=5, desat=0.6)

# prepare data for plotting: 0.1-wide bins on the log10 scale
start_stations['log_count'] = start_stations['rentals'].apply(log_trans)
min_value = log_trans(start_stations['rentals'].min())
max_value = log_trans(start_stations['rentals'].max())
bin_edges = np.arange(0, max_value+0.1, 0.1)
# ticks at whole powers of ten (0, 1, 2, ... on the log10 axis)
tick_locs = np.arange(0, max_value+1, 1)

# plot facet grid
g = sb.FacetGrid(data = start_stations, col = 'pass_type', col_wrap = 3, height = 4, aspect = 1, hue = 'pass_type')
g.map(plt.hist, "log_count", bins = bin_edges)
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Logarithmic distribution of Start stations Rental traffic', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')

# improve plot aesthetics
# Pin the ticks to tick_locs before naming them: set_xticklabels() relabels
# whatever ticks exist, so without this the 1/10/100/... labels would be
# attached to the automatically chosen tick positions instead.
x_tick_names = log_trans(tick_locs, inverse = True).astype(int)
g.set(xticks = tick_locs)
g.set_xticklabels(x_tick_names, size = 12)
g.set_yticklabels(size = 12)
g.set_xlabels('\nBike rentals', size = 14)
g.set_ylabels('Station count\n', size = 14)
plt.subplots_adjust(wspace=0.1, hspace=0.2);

# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.6.b Logarithmic distribution of Start stations Rental traffic based on pass type.png', dpi=300, bbox_inches='tight')
def log_trans(x, inverse = False):
    """Apply a base-10 logarithm to ``x``, or undo it.

    Parameters
    ----------
    x : scalar or array-like
        Value(s) to transform.
    inverse : bool, default False
        When False, return ``np.log10(x)``.  When True, return ``10 ** x``.

    Returns
    -------
    The transformed value(s).
    """
    if not inverse:
        return np.log10(x)
    # np.log10 accepts plain lists/tuples but ``10 ** [..]`` raises TypeError,
    # so convert them first to keep both directions accepting the same inputs.
    if isinstance(x, (list, tuple)):
        x = np.asarray(x)
    return 10 ** x
# Overlaid density curves of log10(rentals), one curve per pass type.

# Assign palette as per requirement
sb.set_style('white')
sb.set_palette('deep', n_colors = 5, desat = 0.8)

# log10 of the largest rental count drives both the bin edges and the tick range
max_value = log_trans(start_stations['rentals'].max())
bin_edges = np.arange(0, max_value+0.1, 0.1)

# draw the curves in a fixed order so each pass type takes the palette colour at its position
for palette_idx, pass_name in enumerate(["Walk-up", "One Day", "Monthly", "Flex", "Annual"]):
    log_rentals = start_stations[start_stations['pass_type'] == pass_name]['rentals'].apply(log_trans)
    sb.distplot(log_rentals, bins = bin_edges, kde = True, kde_kws = {'alpha' :0.8},
                color = sb.color_palette()[palette_idx], label = pass_name, hist=False)

# ticks sit at whole powers of ten but are labelled with the untransformed counts
tick_locs = np.arange(0, max_value+1, 1)
tick_names = log_trans(tick_locs, inverse = True).astype(int)
plt.xticks(tick_locs, tick_names, fontsize = 12)
plt.yticks(fontsize = 12)
plt.xlabel('\nBike rentals', fontsize = 14)
plt.ylabel('Kernal Density Estimation\n', fontsize = 14)
plt.title('Kernal Density Estimation of Start stations Rental traffic\n', fontsize = 16, weight = 'bold')

# legend
plt.legend(title='Pass type', title_fontsize=12, fontsize=10, loc = 'upper right', ncol = 1,
           frameon=True, fancybox=True, shadow=False, framealpha = 1, facecolor='white',
           borderpad=1, borderaxespad=1, labelspacing=0.5, scatterpoints=1,
           markerfirst=True, handlelength=2, handletextpad=0.5);
sb.despine();

# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.6.c Kernal Density Estimation of Start stations Rental traffic based on pass type.png', dpi=300, bbox_inches='tight')
# Bucket each row's rental count into power-of-ten intervals with pd.cut and
# keep it next to the station id and pass type it belongs to.
bin = [0, 10, 100, 1000, 10000, 100000]
category = pd.DataFrame({
    'start_station_id': start_stations['start_station_id'],
    'pass_type': start_stations['pass_type'],
    'rental_bins': pd.cut(start_stations['rentals'], bin),
})
category.head(10)
# the distinct intervals that actually occur, in ascending order
category['rental_bins'].sort_values(ascending=True).unique()
%%time
def label_race(df):
    """Add a 'traffic' column to ``df`` naming the bin held in 'rental_bins'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a categorical 'rental_bins' column (as produced by
        ``pd.cut``).  Modified in place; nothing is returned.

    The bins are matched to labels by their position among the column's
    categories, lowest first: 'Very Low', 'Low', 'Normal', 'High',
    'Very High'.  Rows whose 'rental_bins' is missing get a missing label.
    """
    tiers = ('Very Low', 'Low', 'Normal', 'High', 'Very High')
    # Pair labels with the full list of categories rather than with the bins
    # that happen to occur in the data: if a bin has no rows, indexing the
    # observed values would shift every later label and eventually run off
    # the end of the list.
    all_bins = df['rental_bins'].cat.categories
    bin_to_tier = dict(zip(all_bins, tiers))
    df['traffic'] = df['rental_bins'].cat.rename_categories(bin_to_tier).astype(object)
# `df` is a second name for `category` (not a copy), so labelling it labels `category` too
df = category
label_race(df)

# turn the text labels into an ordered categorical running from 'Very Low' to 'Very High'
level_order = ['Very Low', 'Low', 'Normal', 'High', 'Very High']
ordered_cat = pd.CategoricalDtype(categories = level_order, ordered = True)
df['traffic'] = df['traffic'].astype(ordered_cat)

# number of rows per traffic label
category['traffic'].value_counts()
def plot_pie(cat_type):
    """Draw a donut chart of traffic labels for one pass type on the current axes.

    Parameters
    ----------
    cat_type : str
        Value of ``category['pass_type']`` to select; also used as the title.

    Reads the module-level ``category`` frame.  Traffic labels with no rows
    are left out, and the first remaining wedge is pulled out and drawn in a
    different palette colour from the others.
    """
    selected = category[category['pass_type'] == cat_type]
    per_label = selected.groupby([selected['traffic']]).size().reset_index(name='stations')
    # keep only the traffic labels that actually occur for this pass type
    present = per_label.loc[per_label['stations'] != 0]
    wedge_sizes = present.stations
    wedge_names = present.traffic.sort_values(ascending=True).unique()
    n_wedges = len(wedge_names)

    # first wedge: palette colour 3 and exploded; every other wedge: palette colour 0
    highlight = sb.color_palette()[3]
    regular = sb.color_palette()[0]
    wedge_colours = [highlight, regular, regular, regular, regular][0:n_wedges]
    wedge_offsets = [0.2, 0, 0, 0, 0][0:n_wedges]

    plt.pie(wedge_sizes, labels = wedge_names, colors = wedge_colours, explode = wedge_offsets,
            startangle = 90, counterclock = False, shadow = False,
            wedgeprops = {'width' : 0.4}, textprops = {'fontsize': 12},
            autopct = '%1.0f%%', labeldistance = 1.1, pctdistance = 0.8)
    plt.title(cat_type+ '\n', weight='bold', color='grey', fontsize=14)
    plt.axis('square');
# One donut chart per pass type, laid out on a 2 x 3 grid (last cell left empty).

# Assign palette as per requirement
sb.set_style('white')
flatui = ['darkturquoise', 'mediumturquoise', 'turquoise', 'paleturquoise']
sb.set_palette(flatui, n_colors=4, desat=0.6)

types = category.pass_type.unique()
Ncount = len(category.pass_type.unique())

plt.figure(figsize = [12, 8])
# subplots are numbered from 1, row by row
for slot, pass_name in enumerate(('Walk-up', 'One Day', 'Monthly', 'Flex', 'Annual'), start = 1):
    plt.subplot(2, 3, slot)
    plot_pie(pass_name)

plt.suptitle('Classification of Start station traffic based on pass type', fontsize = 16, weight = 'bold')
plt.subplots_adjust(top=0.8)
plt.subplots_adjust(wspace=0.4, hspace=0.3);

# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.6.d Pie chart classification of Start stations Rental traffic based on pass type.png', dpi=300, bbox_inches='tight')
# Bar chart of traffic-label counts from `category`, one facet per pass type.

# Assign palette as per requirement
sb.set_style('white')
sb.set_palette('GnBu_d', n_colors=5, desat=0.6)

# plot Facet Grid
g = sb.FacetGrid(data = category, col = 'pass_type', col_wrap = 3, height = 4, aspect = 1, hue = 'pass_type')
g.map(sb.countplot, 'traffic', order = category.traffic.sort_values(ascending=True).unique())
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Start station rentals traffic based on pass type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'grey')

# improve plot aesthetics
# Each facet is relabelled from its own tick positions.  Gathering the label
# text of every facet into one list and applying that list to each facet hands
# an axis several times more labels than it has ticks, and parsing label text
# breaks when the text has not been rendered yet.
for ax in g.axes.flat:
    y_ticks = ax.get_yticks()
    ax.set_yticks(y_ticks)
    ax.set_yticklabels(['{:0.0f}'.format(tick) for tick in y_ticks], size = 12)
    # the x tick text is left as countplot wrote it; only the size changes
    ax.tick_params(axis = 'x', labelsize = 12)
g.set_xlabels('\nRental traffic', size = 14)
g.set_ylabels('Start stations\n', size = 14)
g.add_legend(bbox_to_anchor=(0.8, 0.25), scatterpoints=1,frameon=True, fancybox=True,
             shadow=False, framealpha = 1, borderpad=1, borderaxespad=1, labelspacing=0.5,
             ncol = 1, title='Pass type', title_fontsize=14, fontsize=10, facecolor='white',
             markerfirst=True, handlelength=2, handletextpad=0.5);

# add annotations
for ax in g.axes.ravel(): #this will loop over the different figures in the grid
    for p in ax.patches: #this will loop over the different bars in each figure
        ax.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
plt.subplots_adjust(top=0.85)
plt.subplots_adjust(wspace=0.1, hspace=0.2);

# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.6.e Facet grid classification of Start stations Rental traffic based on pass type.png', dpi=300, bbox_inches='tight')
# Clustered bar chart: pass type along the x axis, one bar per traffic label.

# Assign palette as per requirement
sb.set_style('white')
sb.set_palette('GnBu', n_colors=5, desat=0.6)
plt.figure(figsize = [12, 4])

# plot clustered bar chart
g = sb.countplot(data = df, x = 'pass_type', hue = 'traffic', alpha = 0.8, saturation = 1)

# improve plot aesthetics
plt.title('Distribution of Start station traffic over pass type\n\n', fontsize = 16, weight = 'bold')
# the x axis carries pass_type and the colours carry traffic, so the axis label
# and the legend title below are named accordingly
plt.xlabel('\nPass type', fontsize = 14)
plt.ylabel('Number of stations\n', fontsize = 14)
plt.yticks([], fontsize = 12)
plt.xticks(fontsize = 12)
sb.despine(top=True, left=True, right=True, bottom=False)

# plot legend
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1, framealpha = 1,
           borderpad=1, borderaxespad=1, bbox_to_anchor = (1, 0.9), loc = 'upper left', labelspacing=0.5,
           title='Rental traffic', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5)

# add annotations
for p in g.patches:
    g.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()),
               ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')

# dashed separators halfway between neighbouring x groups; derived from the
# number of groups so that no line is drawn past the last group
separators = np.arange(0.5, len(g.get_xticks()) - 1, 1)
for loc in separators:
    plt.axvline(loc, ls='--', color='grey', linewidth=1, alpha=0.4);

# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.6.f Distribution of Start stations Rental traffic based on pass type.png', dpi=300, bbox_inches='tight')
# Clustered bar chart: traffic label along the x axis, one bar per pass type.

# Assign palette as per requirement
sb.set_style('white')
flatui = ["#26bda7", "#9b59b6", "#3498db", "#e74c3c", "#34495e"]
sb.set_palette(flatui)
plt.figure(figsize = [12, 4])

# plot clustered bar chart
g = sb.countplot(x = 'traffic', hue = 'pass_type', data = category, saturation = 1, alpha = 0.8)

# improve plot aesthetics
plt.title('Classification of Start station traffic based on pass type\n\n', fontsize = 16, weight = 'bold')
plt.xlabel('\n Rental traffic', fontsize = 14)
plt.ylabel('Number of stations\n', fontsize = 14)
# no y ticks are drawn; the bar heights are written above the bars instead
plt.yticks([], fontsize = 12)
plt.xticks(fontsize = 12)
sb.despine(top=True, left=True, right=True, bottom=False)

# plot legend
plt.legend(title='Pass type', title_fontsize=12, fontsize=10, loc = 6, bbox_to_anchor = (0.9, 0.8),
           ncol = 1, frameon=True, fancybox=True, shadow=False, framealpha = 1, facecolor='white',
           borderpad=1, borderaxespad=1, labelspacing=0.5, scatterpoints=1,
           markerfirst=True, handlelength=2, handletextpad=0.5)

# add annotations: the height of each bar, centred 10 points above its top
for bar in g.patches:
    bar_top = bar.get_height()
    bar_middle = bar.get_x() + bar.get_width() / 2.
    g.annotate(format(bar_top, '.0f'), (bar_middle, bar_top),
               ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')

# dashed vertical lines at the half-way positions between x groups
separators = [0.5, 1.5, 2.5, 3.5]
for boundary in separators:
    plt.axvline(boundary, ls='--', color='grey', linewidth=1, alpha=0.4);

# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.6.g Classification of Start stations Rental traffic based on pass type.png', dpi=300, bbox_inches='tight')
start_station_id and fare_type columns:
Column: start_station_id, fare_type
Data type: numerical data, continuous
Plot: Scatter plot
Distribution of start_stations rental traffic based on fare_type.
# Count the rows of bikeshare for every (start station, fare type) pair and
# store the count in a 'rentals' column.
station_fare_keys = [bikeshare['start_station_id'], bikeshare['fare_type']]
rentals_per_pair = bikeshare.groupby(station_fare_keys).size()
start_stations = rentals_per_pair.reset_index(name='rentals')
start_stations.head()
Calculate the maximum rental count to estimate a suitable bin size for the plot that follows.
# Largest value in the 'rentals' column; displayed to help choose a histogram bin size
start_stations['rentals'].max()
As the max value is around 32000, let the bin size be 1000.
# Histogram of the 'rentals' column of start_stations, one facet per fare type.

# Assign palette as per requirement
sb.set_style('white')
flatui = ['darkturquoise', 'turquoise']
sb.set_palette(flatui, n_colors=2, desat=0.6)

# prepare data for plotting: 500-wide bins from 0 up to the largest rental count
# NOTE(review): the accompanying text proposes a bin size of 1000, but 500 is
# what is used here -- confirm which one is intended.
max_value = start_stations['rentals'].max()
bin_edges = np.arange(0, max_value+500, 500)
# one x tick every 5000 rentals, extending just past the largest count
tick_locs = np.arange(0, max_value+5000, 5000)

# plot facet grid
g = sb.FacetGrid(data = start_stations, col = 'fare_type', col_wrap = 2, height = 4, aspect = 1.2, hue = 'fare_type')
g.map(plt.hist, "rentals", bins = bin_edges)
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Distribution of Start stations Rental traffic based on fare type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')

# improve plot aesthetics
# set_xticklabels() only renames the ticks that already exist, so the tick
# positions are pinned first and the labels are built from those same
# positions; otherwise the "NK" labels would not line up with the data.
x_tick_locs = tick_locs
x_tick_names = ['{:0.0f}K'.format(loc / 1000) for loc in x_tick_locs]
g.set(xticks = x_tick_locs)
g.set_xticklabels(x_tick_names, size=12)
g.set_yticklabels(size = 12)
g.set_xlabels('\nBike rentals (thousands)', size = 14)
g.set_ylabels('Station count\n', size = 14)
plt.subplots_adjust(wspace=0.1, hspace=0.3);

# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.7.a Distribution of Start stations Rental traffic based on fare type.png', dpi=300, bbox_inches='tight')
This data, in its natural units, looks highly right-skewed: lots of points with low values, with a very long tail of data points with large values (also, all values are positive). The most common example of this is data that follows an approximately log-normal distribution. Such data can look highly skewed in its natural units; however, after applying a logarithmic transform, it follows an approximately normal distribution. Hence, let us apply a logarithmic transformation to the start stations' bike rental data.
def log_trans(x, inverse = False):
    """Apply a base-10 logarithm to ``x``, or undo it.

    Parameters
    ----------
    x : scalar or array-like
        Value(s) to transform.
    inverse : bool, default False
        When False, return ``np.log10(x)``.  When True, return ``10 ** x``.

    Returns
    -------
    The transformed value(s).
    """
    if not inverse:
        return np.log10(x)
    # np.log10 accepts plain lists/tuples but ``10 ** [..]`` raises TypeError,
    # so convert them first to keep both directions accepting the same inputs.
    if isinstance(x, (list, tuple)):
        x = np.asarray(x)
    return 10 ** x
# Histogram of log10(rentals) from start_stations, one facet per fare type.

# Assign palette as per requirement
sb.set_style('white')
flatui = ['darkturquoise', 'turquoise']
sb.set_palette(flatui, desat = 0.6)

# prepare data for plotting: 0.1-wide bins on the log10 scale
start_stations['log_count'] = start_stations['rentals'].apply(log_trans)
min_value = log_trans(start_stations['rentals'].min())
max_value = log_trans(start_stations['rentals'].max())
bin_edges = np.arange(0, max_value+0.1, 0.1)
# ticks at whole powers of ten (0, 1, 2, ... on the log10 axis)
tick_locs = np.arange(0, max_value+1, 1)

# plot facet grid
g = sb.FacetGrid(data = start_stations, col = 'fare_type', col_wrap = 2, height = 4, aspect = 1, hue = 'fare_type')
g.map(plt.hist, "log_count", bins = bin_edges)
g.fig.subplots_adjust(top=0.7)
g.fig.suptitle('Logarithmic distribution of Start stations Rental traffic based on fare type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')

# improve plot aesthetics
# Pin the ticks to tick_locs before naming them: set_xticklabels() relabels
# whatever ticks exist, so without this the 1/10/100/... labels would be
# attached to the automatically chosen tick positions instead.
x_tick_names = log_trans(tick_locs, inverse = True).astype(int)
g.set(xticks = tick_locs)
g.set_xticklabels(x_tick_names, size = 12)
g.set_yticklabels(size = 12)
g.set_xlabels('\nBike rentals', size = 14)
g.set_ylabels('Station count\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3);

# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.7.b Logarithmic distribution of Start stations Rental traffic based on fare type.png', dpi=300, bbox_inches='tight')
def log_trans(x, inverse = False):
    """Apply a base-10 logarithm to ``x``, or undo it.

    Parameters
    ----------
    x : scalar or array-like
        Value(s) to transform.
    inverse : bool, default False
        When False, return ``np.log10(x)``.  When True, return ``10 ** x``.

    Returns
    -------
    The transformed value(s).
    """
    if not inverse:
        return np.log10(x)
    # np.log10 accepts plain lists/tuples but ``10 ** [..]`` raises TypeError,
    # so convert them first to keep both directions accepting the same inputs.
    if isinstance(x, (list, tuple)):
        x = np.asarray(x)
    return 10 ** x
# Two overlaid histograms of log10(rentals): "Base" and "Extended" fare types.

# Assign palette as per requirement
sb.set_style('white')
sb.set_palette('deep', desat = 0.8)

min_value = log_trans(start_stations[start_stations['fare_type'] == "Base"]['rentals'].min())
# Take the maximum over all rows, not over the "Extended" rows only: plt.hist
# ignores values beyond the last bin edge, so edges derived from one fare type
# would drop any larger counts belonging to the other.
max_value = log_trans(start_stations['rentals'].max())
bin_edges = np.arange(0, max_value+0.1, 0.1)

plt.hist(start_stations[start_stations['fare_type'] == "Base"]['rentals'].apply(log_trans),
         bins = bin_edges, color = sb.color_palette()[0], alpha=0.6, label = 'Base')
plt.hist(start_stations[start_stations['fare_type'] == "Extended"]['rentals'].apply(log_trans),
         bins = bin_edges, color = sb.color_palette()[1], alpha=0.6, label = 'Extended')

# x ticks at whole powers of ten, labelled with the untransformed counts
x_tick_locs = np.arange(0, max_value+1, 1)
plt.xticks(x_tick_locs, log_trans(x_tick_locs, inverse = True).astype(int), fontsize = 12)

# get the current tick locations and labels, then replace them with ticks every 5 stations
y_locs, y_labels = plt.yticks()
y_tick_locs = np.arange(0, int(max(y_locs))+5, 5)
y_tick_names = ['{:0.0f}'.format(loc) for loc in y_tick_locs]
plt.yticks(y_tick_locs, y_tick_names, fontsize = 12)

plt.xlabel('\nBike rentals', fontsize = 14)
plt.ylabel('Station count\n', fontsize = 14)
plt.title('Logarithmic distribution of Start stations Rental traffic\n', fontsize = 16, weight = 'bold')
plt.legend(bbox_to_anchor=(1.4, 1), scatterpoints=1,frameon=True, fancybox=True,
           shadow=False, framealpha = 1, borderpad=1, borderaxespad=1, labelspacing=0.5,
           ncol = 1, title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, loc = 'upper right');
sb.despine();

# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.7.c Logarithmic distribution of Start stations Rental traffic based on fare type.png', dpi=300, bbox_inches='tight')
def log_trans(x, inverse = False):
    """Apply a base-10 logarithm to x, or raise 10 to the power x when inverse is truthy."""
    return 10 ** x if inverse else np.log10(x)
# Kernel density estimate of log10(rentals) per start station, one curve per fare type.
sb.set_style('white')
sb.set_palette('deep', desat = 0.8)

# Size the tick range from every station, not from the 'Extended' subset alone,
# so the axis labels also cover the busiest 'Base' stations.
max_value = log_trans(start_stations['rentals'].max())

for color_idx, fare in enumerate(['Base', 'Extended']):
    fare_rentals = start_stations.loc[start_stations['fare_type'] == fare, 'rentals']
    # hist=False draws only the KDE curve, so no bin edges are needed;
    # a fresh kde_kws dict is passed on every call
    sb.distplot(fare_rentals.apply(log_trans),
                kde = True, kde_kws = {'alpha' :0.8, "shade": True},
                color = sb.color_palette()[color_idx], label = fare, hist=False)

# Ticks sit at whole powers of ten but are labelled in natural units (1, 10, 100, ...)
tick_locs = np.arange(0, max_value+1, 1)
plt.xticks(tick_locs, log_trans(tick_locs, inverse = True).astype(int), fontsize = 12)
plt.yticks(fontsize = 12)

plt.xlabel('\nBike rentals', fontsize = 14)
plt.ylabel('Kernal Density Estimation\n', fontsize = 14)
plt.title('Kernal Density Estimation of Start stations Rental traffic\n', fontsize = 16, weight = 'bold')
plt.legend(bbox_to_anchor=(1.3, 1), scatterpoints=1,frameon=True, fancybox=True,
           shadow=False, framealpha = 1, borderpad=1, borderaxespad=1, labelspacing=0.5,
           ncol = 1, title='Fare type', title_fontsize=14, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, loc = 'upper right');
sb.despine();

# bbox_inches='tight' grows the saved area so the axis labels and legend are not cut off
plt.savefig('plots/3.2.7.d Kernal Density Estimation of Start stations Rental traffic based on fare type.png', dpi=300, bbox_inches='tight')
# Bucket every (start station, fare type) rental count into order-of-magnitude bins.
bin = [0,10,100,1000,10000,100000]
# pd.cut maps each rental count to the interval it falls into
category = pd.cut(start_stations['rentals'],bin)
category = category.to_frame()
category.columns = ['rental_bins']
category['fare_type'] = start_stations['fare_type']
category['start_station_id'] = start_stations['start_station_id']
# reindex returns a new frame, so the result must be assigned for the column order to stick
category = category.reindex(columns=['start_station_id', 'fare_type', 'rental_bins'])
category.head()
# Inspect the bins that actually occur; read them from `category`, which this
# cell has just built, instead of a `df` that is only assigned further down
category.rental_bins.sort_values(ascending=True).unique()
%%time
def label_race(df):
    """Add a 'traffic' column naming the traffic level of each row's rental bin.

    The labels follow the order of the categories of ``df['rental_bins']``
    (lowest interval first), not the order of the bins that happen to occur in
    the data. If a bin is absent, the remaining bins therefore keep their
    labels instead of shifting down a level or running past the end of the
    list of present bins.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a categorical 'rental_bins' column, e.g. from ``pd.cut``.
        The frame is modified in place.

    Returns
    -------
    None
        Rows with a missing bin, or a bin beyond the fifth category, get NaN.
    """
    traffic_labels = ['Very Low', 'Low', 'Normal', 'High', 'Very High']
    # cat.codes is the position of each row's bin among all categories (-1 for NaN)
    bin_positions = df['rental_bins'].cat.codes
    df['traffic'] = bin_positions.map(dict(enumerate(traffic_labels)))
# Label every station in place, then store the labels as an ordered categorical
# so they sort by traffic level rather than alphabetically.
df = category
label_race(category)

level_order = ['Very Low', 'Low', 'Normal', 'High', 'Very High']
ordered_cat = pd.api.types.CategoricalDtype(categories = level_order, ordered = True)
df['traffic'] = df.traffic.astype(ordered_cat)

# Number of (station, fare type) rows per traffic level
category['traffic'].value_counts()
def plot_pie(cat_type):
    """Draw a donut chart of station counts per traffic level for one fare type.

    Reads the module-level ``category`` frame, keeps the rows whose 'fare_type'
    equals ``cat_type`` and plots one wedge per traffic level that has at least
    one station. The first wedge is pulled out and drawn in the second palette
    colour; every wedge is annotated with its station count.

    Parameters
    ----------
    cat_type : str
        Fare type to plot; also used as the subplot title.
    """
    df = category[category['fare_type'] == cat_type]
    subdf = df.groupby([df['traffic']]).size().reset_index(name='stations')
    # Drop empty traffic levels so they do not produce zero-width wedges
    df_to_plot = subdf[subdf['stations'] != 0]
    type_level_counts = df_to_plot.stations
    type_level_index = df_to_plot.traffic.sort_values(ascending=True).unique()

    wedge_total = len(type_level_counts)
    highlight_color = sb.color_palette()[1]
    base_color = sb.color_palette()[0]
    clrs = ([highlight_color] + [base_color] * 4)[0:wedge_total]
    explode = [0.2, 0, 0, 0, 0][0:wedge_total]

    p, tx, autotexts = plt.pie(type_level_counts, labels = type_level_index, startangle = 90,
                               counterclock = False, wedgeprops = {'width' : 0.4}, shadow=False,
                               explode=explode, colors=clrs,
                               textprops={'fontsize': 14}, autopct='', labeldistance=1.1, pctdistance=0.8)
    plt.title(cat_type+ '\n\n\n', weight='bold', color='grey', fontsize=14)
    plt.axis('square');

    # Annotate each wedge with its station count. Pair texts and counts by
    # position: after the zero rows are dropped the Series keeps its original
    # index labels, so counts[i] would look up the wrong row or raise KeyError.
    for wedge_text, station_count in zip(autotexts, type_level_counts):
        wedge_text.set_text("{}".format(station_count))
# Side-by-side donut charts of start-station traffic, one per fare type.
sb.set_style('white')
flatui = ['darkturquoise', 'paleturquoise']
sb.set_palette(flatui, desat = 0.6)

types = category.fare_type.unique()
Ncount = len(category.fare_type.unique())

plt.figure(figsize = [12, 6])
for subplot_position, fare in enumerate(('Base', 'Extended'), start = 1):
    plt.subplot(1, 2, subplot_position)
    plot_pie(fare)

plt.suptitle('Classification of Start station traffic based on fare type', fontsize = 16, weight = 'bold')
# Leave head-room for the suptitle and spacing between the two donuts
plt.subplots_adjust(top=0.7, wspace=0.3, hspace=0.3);

# bbox_inches='tight' grows the saved area so all labels are included
plt.savefig('plots/3.2.7.e Pie chart classification of Start stations Rental traffic based on fare type.png', dpi=300, bbox_inches='tight')
# Facet grid of count plots: stations per traffic level, one panel per fare type.
sb.set_style('white')
flatui = ['darkturquoise', 'turquoise']
sb.set_palette(flatui, desat = 0.6)
plt.figure(figsize = [6, 4])

# Traffic levels in ascending order, used as the bar order in every panel
traffic_levels = category.traffic.sort_values(ascending=True).unique()
g = sb.FacetGrid(data = category, col = 'fare_type', col_wrap = 2,
                 height = 4, aspect = 1, hue = 'fare_type')
g.map(sb.countplot, 'traffic', order = traffic_levels)

g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Start station rentals traffic based on fare type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}\n', weight = 'bold', size = 14, color = 'grey')

# Axis cosmetics
g.set_yticklabels(size = 10)
g.set_xticklabels(size = 10)
g.set_xlabels('\nRental traffic', size = 13)
g.set_ylabels('Start stations\n', size = 13)

legend_style = dict(bbox_to_anchor=(1.05, 0.7), scatterpoints=1, frameon=True, fancybox=True,
                    shadow=False, framealpha = 1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                    ncol = 1, title='Fare type', title_fontsize=14, fontsize=10, facecolor='white',
                    markerfirst=True, handlelength=2, handletextpad=0.5)
g.add_legend(**legend_style);
plt.subplots_adjust(top=0.7, wspace=0.3, hspace=0.3);

# Write the bar height above every bar of every panel
for ax in g.axes.ravel():
    for bar in ax.patches:
        bar_height = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2.
        ax.annotate(format(bar_height, '.0f'), (bar_center, bar_height),
                    ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points');

# bbox_inches='tight' grows the saved area so all labels are included
plt.savefig('plots/3.2.7.f Facet grid classification of Start stations Rental traffic based on fare type.png', dpi=300, bbox_inches='tight')
# Clustered bar chart: one group of bars per fare type, one bar per traffic level.
sb.set_style('white')
sb.set_palette('GnBu', desat = 0.6)
plt.figure(figsize = [6, 4])

g = sb.countplot(data = category, x = 'fare_type', hue = 'traffic', alpha = 0.8, saturation = 0.8)

# Titles and axes. The x axis carries the fare types and the hue (legend)
# carries the traffic levels, so they are labelled that way round.
plt.title('Classification of Start station traffic based on fare type\n\n', fontsize = 16, weight = 'bold')
plt.xlabel('\nFare type', fontsize = 14)
plt.ylabel('Number of stations\n', fontsize = 14)
plt.yticks([], fontsize = 12)   # counts are written on the bars, so the y ticks are hidden
plt.xticks(fontsize = 12)
sb.despine(left=True)

plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1, framealpha = 1,
           borderpad=1, borderaxespad=1, bbox_to_anchor = (1, 0.8), loc = 6, labelspacing=0.5,
           title='Rental traffic', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5)

# Write the bar height above every bar
for p in g.patches:
    g.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()),
               ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')

# Dashed divider between the two fare-type groups
separators = [0.5]
for loc in separators:
    plt.axvline(loc, ls='--', color='grey', linewidth=1, alpha=0.4);

# bbox_inches='tight' grows the saved area so all labels and the legend are included
plt.savefig('plots/3.2.7.g Distribution of Start stations Rental traffic based on fare type.png', dpi=300, bbox_inches='tight')
# Clustered bar chart: one group of bars per traffic level, one bar per fare type.
sb.set_style('white')
# Alternative palettes noted by the author: ['#e3ba40', '#faf441'], ['#6845bf', '#c3abff']
flatui = ['#fcd605', '#fae887']
sb.set_palette(flatui, desat = 0.6)
plt.figure(figsize = [6, 4])

g = sb.countplot(data = category, x = 'traffic', hue = 'fare_type',
                 alpha = 0.8, saturation = 0.8)

# Titles and axes
plt.title('Classification of Start station traffic based on fare type\n\n', fontsize = 16, weight = 'bold')
plt.xlabel('\nRental traffic', fontsize = 14)
plt.ylabel('Number of stations\n', fontsize = 14)
plt.yticks([], fontsize = 12)   # counts are written on the bars, so the y ticks are hidden
plt.xticks(fontsize = 12)
sb.despine(left=True)

legend_style = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
                    framealpha = 1, borderpad=1, borderaxespad=1, bbox_to_anchor = (0.8, 0.9),
                    loc = 6, labelspacing=0.5, title='Fare type', title_fontsize=12,
                    fontsize=10, facecolor='white', markerfirst=True, handlelength=2,
                    handletextpad=0.5)
plt.legend(**legend_style)

# Write the bar height above every bar
for bar in g.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    g.annotate(format(bar_height, '.0f'), (bar_center, bar_height),
               ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')

# bbox_inches='tight' grows the saved area so all labels are included
plt.savefig('plots/3.2.7.h Classification of Start stations Rental traffic based on fare type.png', dpi=300, bbox_inches='tight')
end_lat and end_lon columns:¶
Column: end_lat, end_lon
Data type: numerical data, continuous
Plot: Scatter plot
sb.set_style('white')
# Scatter plot of end-station coordinates; points are semi-transparent so that
# overlapping points show up darker.
sb.set_palette(palette = "deep", n_colors = 20, desat = None)

point_style = {'alpha' : 1/5}
sb.regplot(data = bikeshare, x = 'end_lat', y = 'end_lon',
           fit_reg = False,   # plain scatter, no regression line
           scatter_kws = point_style)

plt.title('End station geo-locations', fontsize = 14, weight = 'bold')
plt.xlabel('End station latitude', fontsize = 12)
plt.ylabel('End station longitude', fontsize = 12);

# bbox_inches='tight' grows the saved area so all labels are included
plt.savefig('plots/3.2.8.a End station geo-locations.png', dpi=300, bbox_inches='tight')
# Summary statistics of the end-station coordinates
bikeshare[['end_lat', 'end_lon']].describe()

# 2-D histogram of the coordinates on a grid of 0.01-wide bins
x_bins = np.arange(bikeshare['end_lat'].min(), bikeshare['end_lat'].max()+0.01, 0.01)
y_bins = np.arange(bikeshare['end_lon'].min(), bikeshare['end_lon'].max()+0.01, 0.01)
plt.hist2d(data = bikeshare, x = 'end_lat', y = 'end_lon',
           bins = [x_bins, y_bins],
           cmin = 0.5,            # leave cells without any trip blank
           cmap = 'viridis_r')

plt.title('End station geo-distribution', fontsize = 14, weight = 'bold')
plt.xlabel('End station latitude', fontsize = 12)
plt.ylabel('End station longitude', fontsize = 12)
plt.colorbar();

# bbox_inches='tight' grows the saved area so all labels are included
plt.savefig('plots/3.2.8.b End station geo-distribution.png', dpi=300, bbox_inches='tight')
# 2-D histogram of end-station coordinates with the count printed in every
# non-empty cell.
plt.figure(figsize = [8, 4])
h2d = plt.hist2d(data = bikeshare, x = 'end_lat', y = 'end_lon', cmin = 0.5, cmap = 'viridis_r')
plt.title('End station geo-distribution', fontsize = 14, weight = 'bold')
plt.xlabel('End station latitude', fontsize = 12)
plt.ylabel('End station longitude', fontsize = 12)

# hist2d returns (counts, x edges, y edges, image); cells below cmin come back as NaN
counts = h2d[0]
x_bins = h2d[1]
y_bins = h2d[2]
filled = ~np.isnan(counts)

# Contrast threshold: the mean of the distinct cell counts. Cells at or above
# it are the darkest ones and get white text, the rest get black text.
counts_mean = np.unique(counts[filled]).mean() if filled.any() else np.nan

# Place each label at the true centre of its cell, taken from the bin edges
x_centers = (x_bins[:-1] + x_bins[1:]) / 2
y_centers = (y_bins[:-1] + y_bins[1:]) / 2

for i, j in zip(*np.nonzero(filled)):
    c = counts[i, j]
    text_color = 'white' if c >= counts_mean else 'black'
    plt.text(x_centers[i], y_centers[j], int(c),
             ha = 'center', va = 'center', color = text_color, fontsize = 9)
plt.colorbar();

# bbox_inches='tight' grows the saved area so all labels are included
plt.savefig('plots/3.2.8.c End station geo-distribution.png', dpi=300, bbox_inches='tight')
end_station_id and trip_type columns:¶
Column: end_station_id, trip_type
Data type: numerical data, continuous
Plot: Scatter plot
Distribution of end_stations return traffic based on trip_type.
# Count the trips that ended at each station, split by trip type
end_stations = (
    bikeshare
    .groupby(['end_station_id', 'trip_type'])
    .size()
    .reset_index(name='returns')
)
end_stations.head()
Calculate the maximum value of the count to estimate the bin size for the following plot.
# Summary statistics (count, mean, std, min, quartiles, max) of the per-station return counts
end_stations['returns'].describe()
As the max value is around 36000, let the bin size be 1000.
# Facet grid of histograms: returns per end station, one panel per trip type.
sb.set_style('white')
flatui = ['salmon', 'lightsalmon']
sb.set_palette(flatui, n_colors=2, desat=0.6)

# NOTE(review): the accompanying narrative mentions a bin size of 1000, but the
# bins here are 500 wide - confirm which one is intended.
max_value = end_stations['returns'].max()
bin_edges = np.arange(0, max_value+500, 500)

g = sb.FacetGrid(data = end_stations, col = 'trip_type', col_wrap = 2, height = 4, aspect = 1.2, hue = 'trip_type')
g.map(plt.hist, "returns", bins = bin_edges)
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Distribution of End stations Return traffic based on trip type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')

# Fix the tick positions before labelling them. set_xticklabels on its own
# applies the labels, in order, to whatever ticks matplotlib picked, so a list
# of 1K-step labels would not match the values actually under the ticks.
x_tick_locs = np.arange(0, max_value+1, 5000)
x_tick_names = ['{:0.0f}K'.format(loc/1000) for loc in x_tick_locs]
g.set(xticks = x_tick_locs)
g.set_xticklabels(x_tick_names, size=12)
g.set_yticklabels(size = 12)
g.set_xlabels('\nBike returns', size = 14)
g.set_ylabels('End Station count\n', size = 14)
plt.subplots_adjust(wspace=0.1, hspace=0.3);

# bbox_inches='tight' grows the saved area so all labels are included
plt.savefig('plots/3.2.9.a Distribution of End stations Return traffic based on trip type.png', dpi=300, bbox_inches='tight')
This data, in its natural units, looks highly right skewed: lots of points with low values, with a very long tail of data points with large values (also, all values are positive). The most common example of this is data that follows an approximately log-normal distribution. Such data can look highly skewed in its natural units; however, after applying a logarithmic transform, the data will follow a normal distribution. Hence, let us apply a logarithmic transformation to the end stations' bike return data.
def log_trans(x, inverse = False):
    """Apply a base-10 logarithm to x, or raise 10 to the power x when inverse is truthy."""
    return 10 ** x if inverse else np.log10(x)
# Facet grid of histograms of log10(returns) per end station, one panel per trip type.
sb.set_style('white')
flatui = ['salmon', 'lightsalmon']
sb.set_palette(flatui, desat = 0.6)

# Work in log10 units: bins are 0.1 wide, ticks sit at whole powers of ten
end_stations['log_count'] = end_stations['returns'].apply(log_trans)
max_value = log_trans(end_stations['returns'].max())
bin_edges = np.arange(0, max_value+0.1, 0.1)
tick_locs = np.arange(0, max_value+1, 1)

g = sb.FacetGrid(data = end_stations, col = 'trip_type', col_wrap = 2, height = 4, aspect = 1, hue = 'trip_type')
g.map(plt.hist, "log_count", bins = bin_edges)
g.fig.subplots_adjust(top=0.7)
g.fig.suptitle('Logarithmic distribution of End stations Return traffic\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')

# Fix the tick positions before labelling them in natural units (1, 10, 100, ...).
# set_xticklabels on its own applies the labels, in order, to whatever ticks
# matplotlib picked, which need not be the whole powers of ten.
x_tick_names = log_trans(tick_locs, inverse = True).astype(int)
g.set(xticks = tick_locs)
g.set_xticklabels(x_tick_names, size = 12)
g.set_yticklabels(size = 12)
g.set_xlabels('\nBike returns', size = 14)
g.set_ylabels('End Station count\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3);

# bbox_inches='tight' grows the saved area so all labels are included
plt.savefig('plots/3.2.9.b Logarithmic distribution of End stations Return traffic based on trip type.png', dpi=300, bbox_inches='tight')
def log_trans(x, inverse = False):
    """Apply a base-10 logarithm to x, or raise 10 to the power x when inverse is truthy."""
    return 10 ** x if inverse else np.log10(x)
# Overlay log-scaled histograms of returns per end station, one per trip type.
sb.set_style('white')
sb.set_palette('deep', desat = 0.8)

# Bin range is taken from the 'One Way' subset, in log10 units
one_way_returns = end_stations[end_stations['trip_type'] == "One Way"]['returns']
min_value = log_trans(one_way_returns.min())
max_value = log_trans(one_way_returns.max())
bin_edges = np.arange(0, max_value+0.1, 0.1)

for color_idx, trip in enumerate(["One Way", "Round Trip"]):
    trip_returns = end_stations[end_stations['trip_type'] == trip]['returns']
    plt.hist(trip_returns.apply(log_trans), bins = bin_edges,
             color = sb.color_palette()[color_idx], alpha=0.6, label = trip)

# Ticks sit at whole powers of ten but are labelled in natural units (1, 10, 100, ...)
tick_locs = np.arange(0, max_value+1, 1)
plt.xticks(tick_locs, log_trans(tick_locs, inverse = True).astype(int), fontsize = 12)
plt.yticks(fontsize = 12)

plt.xlabel('\nBike returns', fontsize = 14)
plt.ylabel('End Station count\n', fontsize = 14)
plt.title('Logarithmic distribution of end stations Return traffic\n', fontsize = 16, weight = 'bold')

legend_style = dict(bbox_to_anchor=(1.3, 1), scatterpoints=1, frameon=True, fancybox=True,
                    shadow=False, framealpha = 1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                    ncol = 1, title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
                    markerfirst=True, handlelength=2, handletextpad=0.5, loc = 'upper right')
plt.legend(**legend_style)
sb.despine();

# bbox_inches='tight' grows the saved area so the labels and legend are included
plt.savefig('plots/3.2.9.c Logarithmic distribution of End stations Return traffic based on trip type.png', dpi=300, bbox_inches='tight')
def log_trans(x, inverse = False):
    """Apply a base-10 logarithm to x, or raise 10 to the power x when inverse is truthy."""
    return 10 ** x if inverse else np.log10(x)
# Kernel density estimate of log10(returns) per end station, one curve per trip type.
sb.set_style('white')
sb.set_palette('deep', desat = 0.8)

# Axis range is taken from the 'One Way' subset, in log10 units
one_way_returns = end_stations[end_stations['trip_type'] == "One Way"]['returns']
min_value = log_trans(one_way_returns.min())
max_value = log_trans(one_way_returns.max())
bin_edges = np.arange(0, max_value+0.1, 0.1)

for color_idx, trip in enumerate(["One Way", "Round Trip"]):
    trip_returns = end_stations[end_stations['trip_type'] == trip]['returns']
    # hist=False: only the shaded KDE curve is drawn
    sb.distplot(trip_returns.apply(log_trans),
                bins = bin_edges, kde = True, kde_kws = {'alpha' :0.8, "shade": True},
                color = sb.color_palette()[color_idx], label = trip, hist=False)

# Ticks sit at whole powers of ten but are labelled in natural units (1, 10, 100, ...)
tick_locs = np.arange(0, max_value+1, 1)
plt.xticks(tick_locs, log_trans(tick_locs, inverse = True).astype(int), fontsize = 12)
plt.yticks(fontsize = 12)

plt.xlabel('\nBike returns', fontsize = 14)
plt.ylabel('Kernal Density Estimation\n', fontsize = 14)
plt.title('Logarithmic distribution of End stations Return traffic\n', fontsize = 16, weight = 'bold')

legend_style = dict(bbox_to_anchor=(1.3, 1), scatterpoints=1, frameon=True, fancybox=True,
                    shadow=False, framealpha = 1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                    ncol = 1, title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
                    markerfirst=True, handlelength=2, handletextpad=0.5, loc = 'upper right')
plt.legend(**legend_style);
sb.despine();

# bbox_inches='tight' grows the saved area so the labels and legend are included
plt.savefig('plots/3.2.9.d Kernal Density Estimation of End stations Return traffic based on trip type.png', dpi=300, bbox_inches='tight')
# Bucket every (end station, trip type) return count into order-of-magnitude bins.
bin = [0,10,100,1000,10000,100000]

# pd.cut maps each return count to the interval it falls into; the station id
# and trip type are attached alongside and the columns put in reading order
category = (
    pd.cut(end_stations['returns'], bin)
    .to_frame(name='return_bins')
    .assign(trip_type=end_stations['trip_type'],
            end_station_id=end_stations['end_station_id'])
    .reindex(columns=['end_station_id', 'trip_type', 'return_bins'])
)
category.head()

# The distinct bins that occur, in ascending order
category['return_bins'].sort_values(ascending=True).unique()
%%time
def assign_traffic(df):
    """Add a 'traffic' column naming the traffic level of each row's return bin.

    The labels follow the order of the categories of ``df['return_bins']``
    (lowest interval first), not the order of the bins that happen to occur in
    the data. If a bin is absent, the remaining bins therefore keep their
    labels instead of shifting down a level or running past the end of the
    list of present bins.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a categorical 'return_bins' column, e.g. from ``pd.cut``.
        The frame is modified in place.

    Returns
    -------
    None
        Rows with a missing bin, or a bin beyond the fifth category, get NaN.
    """
    traffic_labels = ['Very Low', 'Low', 'Normal', 'High', 'Very High']
    # cat.codes is the position of each row's bin among all categories (-1 for NaN)
    bin_positions = df['return_bins'].cat.codes
    df['traffic'] = bin_positions.map(dict(enumerate(traffic_labels)))
# Label every end station in place, then store the labels as an ordered
# categorical so they sort by traffic level rather than alphabetically.
df = category
assign_traffic(category)

level_order = ['Very Low', 'Low', 'Normal', 'High', 'Very High']
ordered_cat = pd.api.types.CategoricalDtype(categories = level_order, ordered = True)
df['traffic'] = df.traffic.astype(ordered_cat)

# Number of (station, trip type) rows per traffic level
category['traffic'].value_counts()
def plot_pie(cat_type):
    """Draw a donut chart of end-station counts per traffic level for one trip type.

    Reads the module-level ``category`` frame, keeps the rows whose 'trip_type'
    equals ``cat_type`` and plots one wedge per traffic level that has at least
    one station. Wedges holding the smallest count are pulled out and drawn in
    the second palette colour; each wedge shows its percentage share.
    """
    selected = category[category['trip_type'] == cat_type]
    per_level = selected.groupby([selected['traffic']]).size().reset_index(name='stations')
    # Keep only the traffic levels that actually have stations
    non_empty = per_level[per_level['stations'] != 0]
    type_level_counts = non_empty.stations
    type_level_index = non_empty.traffic.sort_values(ascending=True).unique()

    smallest = type_level_counts.min()
    base_color = sb.color_palette()[0]
    highlight_color = sb.color_palette()[1]
    clrs = []
    explode = []
    for station_count in type_level_counts:
        if station_count > smallest:
            clrs.append(base_color)
            explode.append(0)
        else:
            clrs.append(highlight_color)
            explode.append(0.2)

    plt.pie(type_level_counts, labels = type_level_index, startangle = 90,
            counterclock = False, wedgeprops = {'width' : 0.4}, shadow=False,
            explode=explode, colors=clrs, textprops={'fontsize': 14},
            autopct='%1.0f%%', labeldistance=1.1, pctdistance=0.8)
    plt.title(cat_type+ '\n\n', weight='bold', color='grey', fontsize=14)
    plt.axis('square');
# Side-by-side donut charts of end-station return traffic, one per trip type.
sb.set_style('white')
flatui = ['salmon', 'bisque']
sb.set_palette(flatui, desat = 0.6)

types = category.trip_type.unique()
Ncount = len(category.trip_type.unique())

plt.figure(figsize = [12, 6])
for subplot_position, trip in enumerate(('One Way', 'Round Trip'), start = 1):
    plt.subplot(1, 2, subplot_position)
    plot_pie(trip)

plt.suptitle('Classification of End station return traffic based on trip type', fontsize = 16, weight = 'bold')
# Leave head-room for the suptitle and spacing between the two donuts
plt.subplots_adjust(top=0.7, wspace=0.3, hspace=0.3);

# bbox_inches='tight' grows the saved area so all labels are included
plt.savefig('plots/3.2.9.e Pie chart classification of End stations Return traffic based on trip type.png', dpi=300, bbox_inches='tight')
# Facet grid of count plots: end stations per traffic level, one panel per trip type.
sb.set_style('white')
flatui = ['salmon', 'lightsalmon']
sb.set_palette(flatui, desat = 0.8)
plt.figure(figsize = [6, 4])

# Traffic levels in ascending order, used as the bar order in every panel
traffic_levels = category.traffic.sort_values(ascending=True).unique()
g = sb.FacetGrid(data = category, col = 'trip_type', col_wrap = 2,
                 height = 4, aspect = 1, hue = 'trip_type')
g.map(sb.countplot, 'traffic', order = traffic_levels)

g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('End station return traffic based on trip type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}\n', weight = 'bold', size = 14, color = 'grey')

# Axis cosmetics
g.set_yticklabels(size = 10)
g.set_xticklabels(size = 10)
g.set_xlabels('\nReturn traffic', size = 12)
g.set_ylabels('End stations count\n', size = 12)

legend_style = dict(bbox_to_anchor=(1, 0.7), scatterpoints=1, frameon=True, fancybox=True,
                    shadow=False, framealpha = 1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                    ncol = 1, title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
                    markerfirst=True, handlelength=2, handletextpad=0.5)
g.add_legend(**legend_style);
plt.subplots_adjust(top=0.7, wspace=0.2, hspace=0.3)

# Write the bar height above every bar of every panel
for ax in g.axes.ravel():
    for bar in ax.patches:
        bar_height = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2.
        ax.annotate(format(bar_height, '.0f'), (bar_center, bar_height),
                    ha = 'center', va = 'center', xytext = (0, 10),
                    textcoords = 'offset points', fontsize=10);

# bbox_inches='tight' grows the saved area so all labels are included
plt.savefig('plots/3.2.9.f Facet grid classification of End stations Return traffic based on trip type.png', dpi=300, bbox_inches='tight')
# Clustered bar chart: one group of bars per trip type, one bar per traffic level.
sb.set_style('white')
flatui = ['bisque', 'lightsalmon', 'darksalmon', 'salmon', 'tomato']
sb.set_palette(flatui, n_colors=5, desat=0.6)
plt.figure(figsize = [6, 4])

g = sb.countplot(data = category, x = 'trip_type', hue = 'traffic', alpha = 0.8, saturation = 0.9)

# Titles and axes. The x axis carries the trip types and the hue (legend)
# carries the traffic levels, so they are labelled that way round.
plt.title('Classification of End station traffic based on trip type\n\n', fontsize = 16, weight = 'bold')
plt.xlabel('\nTrip type', fontsize = 14)
plt.ylabel('Number of stations\n', fontsize = 14)
plt.yticks([], fontsize = 12)   # counts are written on the bars, so the y ticks are hidden
plt.xticks(fontsize = 12)
sb.despine(left=True)

plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1, framealpha = 1,
           borderpad=1, borderaxespad=1, bbox_to_anchor = (1, 0.8), loc = 6, labelspacing=0.5,
           title='Return traffic', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5)

# Write the bar height above every bar
for p in g.patches:
    g.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()),
               ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')

# Dashed divider between the two trip-type groups (x positions 0 and 1); a
# second line at 1.5 would sit on the right edge of the axes
separators = [0.5]
for loc in separators:
    plt.axvline(loc, ls='--', color='grey', linewidth=1, alpha=0.4);

# bbox_inches='tight' grows the saved area so all labels and the legend are included
plt.savefig('plots/3.2.9.g Distribution of End stations Return traffic based on trip type.png', dpi=300, bbox_inches='tight')
# Clustered bar chart: one group of bars per traffic level, one bar per trip type.
sb.set_style('white')
flatui = ['salmon', 'bisque']
sb.set_palette(flatui, desat = 0.8)
plt.figure(figsize = [6, 4])

g = sb.countplot(data = category, x = 'traffic', hue = 'trip_type',
                 alpha = 0.8, saturation = 0.9)

# Titles and axes
plt.title('Classification of End station traffic based on trip type\n\n', fontsize = 16, weight = 'bold')
plt.xlabel('\nReturn traffic', fontsize = 14)
plt.ylabel('Number of stations\n', fontsize = 14)
plt.yticks([], fontsize = 12)   # counts are written on the bars, so the y ticks are hidden
plt.xticks(fontsize = 12)
sb.despine(left=True)

legend_style = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
                    framealpha = 1, borderpad=1, borderaxespad=1, bbox_to_anchor = (0.8, 0.9),
                    loc = 6, labelspacing=0.5, title='Trip type', title_fontsize=12,
                    fontsize=10, facecolor='white', markerfirst=True, handlelength=2,
                    handletextpad=0.5)
plt.legend(**legend_style)

# Write the bar height above every bar
for bar in g.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    g.annotate(format(bar_height, '.0f'), (bar_center, bar_height),
               ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')

# bbox_inches='tight' grows the saved area so all labels are included
plt.savefig('plots/3.2.9.h Classification of End stations Return traffic based on trip type.png', dpi=300, bbox_inches='tight')
end_station_id and bike_type columns:¶
Column: end_station_id, bike_type
Data type: numerical data, continuous
Plot: Scatter plot
Distribution of end_stations return traffic based on bike_type.
# Count the trips that ended at each station, split by bike type
end_stations = (
    bikeshare
    .groupby(['end_station_id', 'bike_type'])
    .size()
    .reset_index(name='returns')
)
end_stations.head()
Calculate the maximum value of the count to estimate the bin size for the following plot.
# Largest number of returns recorded for a single (end station, bike type) pair
end_stations['returns'].max()
As the max value is around 20000, let the bin size be 500.
# Facet grid of histograms: returns per end station, one panel per bike type.
sb.set_style('white')
flatui = ['darksalmon', 'tomato', 'salmon', 'lightsalmon']
sb.set_palette(flatui, n_colors=4, desat=0.6)

# Bins are 500 returns wide
max_value = end_stations['returns'].max()
bin_edges = np.arange(0, max_value+500, 500)

g = sb.FacetGrid(data = end_stations, col = 'bike_type', col_wrap = 2, height = 4, aspect = 1, hue = 'bike_type')
g.map(plt.hist, "returns", bins = bin_edges)
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Distribution of End stations Return traffic based on bike type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')

# Fix the tick positions before labelling them. set_xticklabels on its own
# applies the labels, in order, to whatever ticks matplotlib picked, so a list
# of 1K-step labels would not match the values actually under the ticks.
x_tick_locs = np.arange(0, max_value+1, 5000)
x_tick_names = ['{:0.0f}K'.format(loc/1000) for loc in x_tick_locs]
g.set(xticks = x_tick_locs)
g.set_xticklabels(x_tick_names, size=12)
g.set_yticklabels(size = 12)
g.set_xlabels('\nBike returns', size = 14)
g.set_ylabels('End Station count\n', size = 14)
plt.subplots_adjust(wspace=0.1, hspace=0.3);

# bbox_inches='tight' grows the saved area so all labels are included
plt.savefig('plots/3.2.10.a Distribution of End stations Return traffic based on bike type.png', dpi=300, bbox_inches='tight')
This data, in its natural units, looks highly right skewed: lots of points with low values, with a very long tail of data points with large values (also, all values are positive). The most common example of this is data that follows an approximately log-normal distribution. Such data can look highly skewed in its natural units; however, after applying a logarithmic transform, the data will follow a normal distribution. Hence, let us apply a logarithmic transformation to the end stations' bike return data.
def log_trans(x, inverse = False):
    """Apply a base-10 logarithm to x, or raise 10 to the power x when inverse is truthy."""
    return 10 ** x if inverse else np.log10(x)
# Facet grid of histograms of log10(returns) per end station, one panel per bike type.
sb.set_style('white')
flatui = ['tomato', 'darksalmon', 'salmon', 'lightsalmon']
sb.set_palette(flatui, n_colors=4, desat=0.6)

# Work in log10 units: bins are 0.1 wide, ticks sit at whole powers of ten
end_stations['log_count'] = end_stations['returns'].apply(log_trans)
max_value = log_trans(end_stations['returns'].max())
bin_edges = np.arange(0, max_value+0.1, 0.1)
tick_locs = np.arange(0, max_value+1, 1)

g = sb.FacetGrid(data = end_stations, col = 'bike_type', col_wrap = 2, height = 4, aspect = 1, hue = 'bike_type')
g.map(plt.hist, "log_count", bins = bin_edges)
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Logarithmic distribution of end stations Return traffic', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')

# Fix the tick positions before labelling them in natural units (1, 10, 100, ...).
# set_xticklabels on its own applies the labels, in order, to whatever ticks
# matplotlib picked, which need not be the whole powers of ten.
x_tick_names = log_trans(tick_locs, inverse = True).astype(int)
g.set(xticks = tick_locs)
g.set_xticklabels(x_tick_names, size = 12)
g.set_yticklabels(size = 12)
g.set_xlabels('\nBike returns', size = 14)
g.set_ylabels('End Station count\n', size = 14)
plt.subplots_adjust(wspace=0.1, hspace=0.3);

# bbox_inches='tight' grows the saved area so all labels are included
plt.savefig('plots/3.2.10.b Logarithmic distribution of End stations Return traffic based on bike type.png', dpi=300, bbox_inches='tight')
def log_trans(x, inverse = False):
    """Convert between natural units and base-10 logarithmic units.

    With ``inverse`` left False the base-10 logarithm of ``x`` is returned;
    with ``inverse`` True the transform is undone by raising 10 to ``x``.
    """
    if inverse:
        return 10 ** x
    return np.log10(x)
# Overlaid density curves of log10(returns), one curve per bike type.
# Assign palette as per requirement
sb.set_style('white')
sb.set_palette('deep', n_colors = 4, desat = 0.8)
max_value = log_trans(end_stations['returns'].max())
bin_edges = np.arange(0, max_value+0.1, 0.1)
# one curve per bike type; the i-th type takes the i-th palette colour
for color_index, bike_type_name in enumerate(['unknown', 'Standard', 'Electric', 'Smart']):
    type_mask = end_stations['bike_type'] == bike_type_name
    sb.distplot(end_stations[type_mask]['returns'].apply(log_trans),
                bins = bin_edges, kde = True, kde_kws = {'alpha' :0.8},
                color = sb.color_palette()[color_index], label = bike_type_name, hist=False)
# ticks sit on whole decades and are labelled with the value in natural units
tick_locs = np.arange(0, max_value+1, 1)
plt.xticks(tick_locs, log_trans(tick_locs, inverse = True).astype(int), fontsize = 12)
plt.yticks(fontsize = 12)
plt.xlabel('\nBike returns', fontsize = 14)
plt.ylabel('Kernal Density Estimation\n', fontsize = 14)
plt.title('Logarithmic distribution of End stations Return traffic\n', fontsize = 16, weight = 'bold')
legend_options = dict(scatterpoints=1, frameon=True, fancybox=True, loc = 'upper left',
                      shadow=False, framealpha = 1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                      ncol = 1, title='Bike type', title_fontsize=12, fontsize=10, facecolor='white',
                      markerfirst=True, handlelength=2, handletextpad=0.5)
plt.legend(**legend_options);
sb.despine();
# bbox_inches='tight' grows the saved area so every axis label is included
plt.savefig('plots/3.2.10.c Kernal Density Estimation of End stations Return traffic based on bike type.png', dpi=300, bbox_inches='tight')
# Decade-wide bin edges for the per-station return counts.
# NOTE: `bin` shadows the Python builtin of the same name; the name is kept as-is.
bin = [0,10,100,1000,10000,100000]
# pd.cut assigns every return count to the interval it falls in; the frame is
# assembled directly in its final column order
category = pd.DataFrame({
    'end_station_id': end_stations['end_station_id'],
    'bike_type': end_stations['bike_type'],
    'return_bins': pd.cut(end_stations['returns'], bin),
})
category.head()
# the distinct intervals that actually occur, in ascending order
category.return_bins.sort_values(ascending=True).unique()
%%time
def assign_traffic(df):
    """Add a 'traffic' column that names the return-count bin of every row.

    ``df['return_bins']`` must be a categorical column (as produced by
    ``pd.cut``). The first five bin categories are labelled, in order,
    'Very Low', 'Low', 'Normal', 'High' and 'Very High'. Rows whose bin is
    missing (NaN) get a missing 'traffic' value. ``df`` is modified in place
    and nothing is returned.
    """
    labels = ('Very Low', 'Low', 'Normal', 'High', 'Very High')
    # Label by the position of the bin among ALL bin categories, not among the
    # bins that happen to occur in the data: indexing the observed unique
    # values shifts every label (and then raises IndexError) as soon as one
    # bin is empty.
    bin_position = df['return_bins'].cat.codes
    df['traffic'] = bin_position.map(dict(enumerate(labels)))
# `df` is an alias of `category`, so the labelling below changes `category` in place
df = category
assign_traffic(category)
# store the labels as an ordered categorical running from 'Very Low' to 'Very High'
level_order = ['Very Low', 'Low', 'Normal', 'High', 'Very High']
ordered_cat = pd.api.types.CategoricalDtype(categories = level_order, ordered = True)
category['traffic'] = category['traffic'].astype(ordered_cat)
# number of rows in each traffic class
df['traffic'].value_counts()
def plot_pie(cat_type):
    """Draw a donut chart of the traffic classes of one bike type.

    Rows of the global ``category`` frame whose 'bike_type' equals
    ``cat_type`` are counted per traffic class and drawn on the current
    matplotlib axes. The first wedge is pulled out and coloured differently
    from the others.
    """
    palette = sb.color_palette()
    subset = category[category['bike_type'] == cat_type]
    per_class = subset.groupby([subset['traffic']]).size().reset_index(name='stations')
    # keep only the classes that have at least one row
    present = per_class[per_class['stations'] != 0]
    wedge_sizes = present.stations
    wedge_labels = present.traffic.sort_values(ascending=True).unique()
    n_wedges = len(wedge_labels)
    # styling for up to five wedges; only the first n_wedges entries are used
    wedge_colors = [palette[4]] + [palette[2]] * 4
    wedge_offsets = [ 0.2, 0, 0, 0, 0 ]
    plt.pie(wedge_sizes, labels = wedge_labels, startangle = 90,
            counterclock = False, wedgeprops = {'width' : 0.4}, shadow=False,
            explode=wedge_offsets[0:n_wedges], colors=wedge_colors[0:n_wedges],
            textprops={'fontsize': 12}, autopct='%1.0f%%', labeldistance=1.1, pctdistance=0.8)
    plt.title(cat_type+ '\n\n', weight='bold', color='grey', fontsize=14)
    plt.axis('square');
# One donut chart per bike type, laid out side by side in a single row.
# Assign palette as per requirement
sb.set_style('white')
flatui = ['tomato', 'darksalmon', 'salmon', 'lightsalmon', 'bisque']
sb.set_palette(flatui, n_colors=5, desat=0.6)
types = category.bike_type.unique()
Ncount = len(types)
plt.figure(figsize = [18, 6])
# subplot positions are 1-based; one panel per bike type
for panel, bike_type_name in enumerate(['unknown', 'Standard', 'Electric', 'Smart'], start = 1):
    plt.subplot(1, 4, panel)
    plot_pie(bike_type_name)
plt.suptitle('Classification of End station traffic based on bike type', fontsize = 16, weight = 'bold')
plt.subplots_adjust(top=0.8)
plt.subplots_adjust(wspace=0.1, hspace=0.2);
# bbox_inches='tight' grows the saved area so every label is included
plt.savefig('plots/3.2.10.d Pie chart classification of End stations Return traffic based on bike type.png', dpi=300, bbox_inches='tight')
# Count of end stations per traffic class, one facet per bike type.
# Assign palette as per requirement
sb.set_style('white')
flatui = ['bisque', 'tomato', 'darksalmon', 'salmon', 'lightsalmon']
sb.set_palette(flatui, n_colors=5, desat=0.6)
# plot Facet Grid
g = sb.FacetGrid(data = category, col = 'bike_type', col_wrap = 2, height = 4, aspect = 1, hue = 'bike_type')
g.map(sb.countplot, 'traffic', order = category.traffic.sort_values(ascending=True).unique())
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('End station return traffic based on bike type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'grey')
# Build ONE set of tick labels from the first facet (the grid is created with the
# default shared axes). Collecting labels from every facet yields a list several
# times longer than the number of ticks, and parsing the label text with int()
# fails when the text is empty or not an integer, so the y labels are derived
# from the tick positions instead.
first_ax = g.axes.flat[0]
y_tick_names = ['{:0.0f}'.format(tick) for tick in first_ax.get_yticks()]
x_tick_names = [label.get_text() for label in first_ax.get_xticklabels()]
# improve plot aesthetics
g.set_yticklabels(y_tick_names, size = 11)
g.set_xticklabels(x_tick_names, size = 11)
g.set_xlabels('\nReturn traffic', size = 13)
g.set_ylabels('End stations count\n', size = 13)
g.add_legend(bbox_to_anchor=(1.05, 0.7), scatterpoints=1,frameon=True, fancybox=True,
             shadow=False, framealpha = 1, borderpad=1, borderaxespad=1, labelspacing=0.5,
             ncol = 1, title='Bike type', title_fontsize=14, fontsize=10, facecolor='white',
             markerfirst=True, handlelength=2, handletextpad=0.5);
# add annotations: write each bar's height just above the bar
for ax in g.axes.ravel(): #this will loop over the different figures in the grid
    for p in ax.patches: #this will loop over the different bars in each figure
        ax.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
plt.subplots_adjust(top=0.85)
plt.subplots_adjust(wspace=0.1, hspace=0.2);
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.10.e Facet grid classification of End stations Return traffic based on bike type.png', dpi=300, bbox_inches='tight')
# Clustered bar chart: bike type on the x axis, one bar per traffic class.
# Assign palette as per requirement
sb.set_style('white')
flatui = ['bisque', 'lightsalmon', 'darksalmon', 'salmon', 'tomato']
sb.set_palette(flatui, n_colors=5, desat=0.6)
plt.figure(figsize = [8, 4])
# plot clustered bar chart
g = sb.countplot(data = category, x = 'bike_type', hue = 'traffic', alpha = 0.8, saturation = 1)
# improve plot aesthetics
plt.title('Classification of End station traffic based on bike type\n\n', fontsize = 16, weight = 'bold')
# the x axis holds the bike types (traffic is the hue, named in the legend),
# so it is labelled accordingly
plt.xlabel('\nBike type', fontsize = 14)
plt.ylabel('End stations count\n', fontsize = 14)
# y ticks are hidden because every bar is annotated with its count below
plt.yticks([], fontsize = 12)
plt.xticks(fontsize = 12)
sb.despine(left=True)
# plot legend
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1, framealpha = 1,
           borderpad=1, borderaxespad=1, bbox_to_anchor = (0.8, 0.9), loc = 6, labelspacing=0.5,
           title='Return Traffic', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5)
# add annotations
# -------------------------------------------------------
for p in g.patches:
    g.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()),
               ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
# -------------------------------------------------------
# dashed vertical lines between neighbouring groups of bars
separators = [0.5, 1.5, 2.5]
for loc in separators:
    plt.axvline(loc, ls='--', color='grey', linewidth=1, alpha=0.4);
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.10.f Distribution of End stations Return traffic based on bike type.png', dpi=300, bbox_inches='tight')
# Clustered bar chart: traffic class on the x axis, one bar per bike type.
# Assign palette as per requirement
sb.set_style('white')
sb.set_palette('deep', n_colors=4, desat=0.6)
plt.figure(figsize = [8, 4])
g = sb.countplot(data = category, x = 'traffic', hue = 'bike_type', alpha = 0.8, saturation = 1)
# titles and axis text; y ticks are hidden because each bar carries its count
plt.title('Classification of End station traffic based on bike type\n\n', fontsize = 16, weight = 'bold')
plt.xlabel('\nReturn traffic', fontsize = 14)
plt.ylabel('End stations count\n', fontsize = 14)
plt.yticks([], fontsize = 12)
plt.xticks(fontsize = 12)
sb.despine(left=True)
# legend
legend_options = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1, framealpha = 1,
                      borderpad=1, borderaxespad=1, bbox_to_anchor = (0.9, 0.8), loc = 6, labelspacing=0.5,
                      title='Bike type', title_fontsize=12, fontsize=10, facecolor='white',
                      markerfirst=True, handlelength=2, handletextpad=0.5)
plt.legend(**legend_options)
# write each bar's height just above the bar
for bar in g.patches:
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    g.annotate(format(bar_top, '.0f'), (bar_center, bar_top),
               ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
# dashed vertical lines between neighbouring groups of bars
separators = [0.5, 1.5, 2.5, 3.5]
for boundary in separators:
    plt.axvline(boundary, ls='--', color='grey', linewidth=1, alpha=0.4);
# bbox_inches='tight' grows the saved area so every label is included
plt.savefig('plots/3.2.10.g Classification of End stations Return traffic based on bike type.png', dpi=300, bbox_inches='tight')
end_station_id and pass_type columns:
Column: end_station_id, pass_type
Data type: numerical data, continuous
Plot: Scatter plot
Distribution of end_stations return traffic based on pass_type.
# count the trips ending at each station, separately for each pass type
station_pass_groups = bikeshare.groupby([bikeshare['end_station_id'], bikeshare['pass_type']])
end_stations = station_pass_groups.size().reset_index(name='returns')
end_stations.head()
Calculate the max value of the count to estimate the bin size for the following plot.
# largest value in the 'returns' column; the accompanying text uses it to pick the histogram bin width
end_stations['returns'].max()
As the max value is around 28000, let the bin size be 500.
# Histogram of returns per end station, one facet per pass type.
# Assign palette as per requirement
sb.set_style('white')
flatui = ['indianred', 'lightcoral', 'darksalmon', 'salmon', 'lightsalmon']
sb.set_palette(flatui, n_colors=5, desat=0.6)
# prepare data for plotting
max_value = end_stations['returns'].max()
bin_edges = np.arange(0, max_value+500, 500)
# tick positions in data units (number of returns), one every 10,000
tick_locs = np.arange(0, max_value+10000, 10000)
# plot facet grid
g = sb.FacetGrid(data = end_stations, col = 'pass_type', col_wrap = 3, height = 3, aspect = 1, hue = 'pass_type')
g.map(plt.hist, "returns", bins = bin_edges)
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Distribution of End stations Return traffic based on pass type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
# improve plot aesthetics
x_tick_max = (max_value/1000)
# the same tick positions expressed in thousands, used for the "K" labels
x_tick_locs = tick_locs / 1000
x_tick_names = ['{:0.0f}K'.format(loc) for loc in x_tick_locs]
# Pin the tick positions BEFORE relabelling them. Without this the labels
# (previously one per 1,000 returns) are attached to whatever ticks matplotlib
# picked automatically, so a tick at 5000 would read "1K".
g.set(xticks = tick_locs)
g.set_xticklabels(x_tick_names, size=12)
g.set_yticklabels(size = 12)
g.set_xlabels('\nBike returns', size = 14)
g.set_ylabels('End Station count\n', size = 14)
plt.subplots_adjust(wspace=0.1, hspace=0.2);
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.11.a Distribution of End stations Return traffic based on pass type.png', dpi=300, bbox_inches='tight')
This data, in its natural units, looks highly right skewed: lots of points with low values, with a very long tail of data points with large values (also, all values are positive). The most common example of this is data that follows an approximately log-normal distribution: in its natural units it can look highly skewed, but after applying a logarithmic transform the data will follow a normal distribution. Hence let us apply a logarithmic transformation to the end stations' bike return data.
def log_trans(x, inverse = False):
    """Convert between natural units and base-10 logarithmic units.

    With ``inverse`` left False the base-10 logarithm of ``x`` is returned;
    with ``inverse`` True the transform is undone by raising 10 to ``x``.
    """
    if inverse:
        return 10 ** x
    return np.log10(x)
# Histogram of log10(returns) per end station, one facet per pass type.
# Assign palette as per requirement
sb.set_style('white')
flatui = ['indianred', 'lightcoral', 'darksalmon', 'salmon', 'lightsalmon']
sb.set_palette(flatui, n_colors=5, desat=0.6)
# prepare data for plotting
end_stations['log_count'] = end_stations['returns'].apply(log_trans)
min_value = log_trans(end_stations['returns'].min())
max_value = log_trans(end_stations['returns'].max())
# bins of 0.1 in log10 units; one tick per decade (0 -> 1, 1 -> 10, 2 -> 100, ...)
bin_edges = np.arange(0, max_value+0.1, 0.1)
tick_locs = np.arange(0, max_value+1, 1)
# plot facet grid
g = sb.FacetGrid(data = end_stations, col = 'pass_type', col_wrap = 3, height = 3, aspect = 1, hue = 'pass_type')
g.map(plt.hist, "log_count", bins = bin_edges)
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Logarithmic distribution of End stations Return traffic', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
# improve plot aesthetics
x_tick_names = log_trans(tick_locs, inverse = True).astype(int)
# Pin the tick positions to the decades BEFORE relabelling them. Without this the
# decade labels are attached to whatever ticks matplotlib picked automatically,
# so the axis would show values that do not match the data.
g.set(xticks = tick_locs)
g.set_xticklabels(x_tick_names, size = 12)
g.set_yticklabels(size = 12)
g.set_xlabels('\nBike returns', size = 13)
g.set_ylabels('End Station count\n', size = 13)
plt.subplots_adjust(wspace=0.1, hspace=0.2);
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.11.b Logarithmic distribution of End stations Return traffic based on pass type.png', dpi=300, bbox_inches='tight')
def log_trans(x, inverse = False):
    """Convert between natural units and base-10 logarithmic units.

    With ``inverse`` left False the base-10 logarithm of ``x`` is returned;
    with ``inverse`` True the transform is undone by raising 10 to ``x``.
    """
    if inverse:
        return 10 ** x
    return np.log10(x)
# Overlaid density curves of log10(returns), one curve per pass type.
# Assign palette as per requirement
sb.set_style('white')
flatui = ['indianred', 'lightcoral', 'darksalmon', 'salmon', 'lightsalmon']
# the curves use the 'deep' palette; `flatui` above is assigned but not applied here
sb.set_palette('deep', n_colors = 5, desat = 0.8)
max_value = log_trans(end_stations['returns'].max())
bin_edges = np.arange(0, max_value+0.1, 0.1)
# one curve per pass type; the i-th type takes the i-th palette colour
for color_index, pass_type_name in enumerate(['Walk-up', 'One Day', 'Monthly', 'Flex', 'Annual']):
    type_mask = end_stations['pass_type'] == pass_type_name
    sb.distplot(end_stations[type_mask]['returns'].apply(log_trans),
                bins = bin_edges, kde = True, kde_kws = {'alpha' :0.8},
                color = sb.color_palette()[color_index], label = pass_type_name, hist=False)
# ticks sit on whole decades and are labelled with the value in natural units
tick_locs = np.arange(0, max_value+1, 1)
plt.xticks(tick_locs, log_trans(tick_locs, inverse = True).astype(int), fontsize = 12)
plt.yticks(fontsize = 12)
plt.xlabel('\nBike returns', fontsize = 14)
plt.ylabel('Kernal Density Estimation\n', fontsize = 14)
plt.title('Logarithmic distribution of End stations Return traffic\n', fontsize = 16, weight = 'bold')
legend_options = dict(scatterpoints=1, frameon=True, fancybox=True, loc = 'upper right',
                      shadow=False, framealpha = 1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                      ncol = 1, title='Pass type', title_fontsize=12, fontsize=10, facecolor='white',
                      markerfirst=True, handlelength=2, handletextpad=0.5)
plt.legend(**legend_options);
sb.despine();
# bbox_inches='tight' grows the saved area so every axis label is included
plt.savefig('plots/3.2.11.c Kernal Density Estimation of End stations Return traffic based on pass type.png', dpi=300, bbox_inches='tight')
# Decade-wide bin edges for the per-station return counts.
# NOTE: `bin` shadows the Python builtin of the same name; the name is kept as-is.
bin = [0,10,100,1000,10000,100000]
# pd.cut assigns every return count to the interval it falls in; the frame is
# assembled directly in its final column order
category = pd.DataFrame({
    'end_station_id': end_stations['end_station_id'],
    'pass_type': end_stations['pass_type'],
    'return_bins': pd.cut(end_stations['returns'], bin),
})
category.head()
# the distinct intervals that actually occur, in ascending order
category.return_bins.sort_values(ascending=True).unique()
%%time
def assign_traffic(df):
    """Add a 'traffic' column that names the return-count bin of every row.

    ``df['return_bins']`` must be a categorical column (as produced by
    ``pd.cut``). The first five bin categories are labelled, in order,
    'Very Low', 'Low', 'Normal', 'High' and 'Very High'. Rows whose bin is
    missing (NaN) get a missing 'traffic' value. ``df`` is modified in place
    and nothing is returned.
    """
    labels = ('Very Low', 'Low', 'Normal', 'High', 'Very High')
    # Label by the position of the bin among ALL bin categories, not among the
    # bins that happen to occur in the data: indexing the observed unique
    # values shifts every label (and then raises IndexError) as soon as one
    # bin is empty.
    bin_position = df['return_bins'].cat.codes
    df['traffic'] = bin_position.map(dict(enumerate(labels)))
# `df` is an alias of `category`, so the labelling below changes `category` in place
df = category
assign_traffic(category)
# store the labels as an ordered categorical running from 'Very Low' to 'Very High'
level_order = ['Very Low', 'Low', 'Normal', 'High', 'Very High']
ordered_cat = pd.api.types.CategoricalDtype(categories = level_order, ordered = True)
category['traffic'] = category['traffic'].astype(ordered_cat)
# number of rows in each traffic class
df['traffic'].value_counts()
def plot_pie(cat_type):
    """Draw a donut chart of the traffic classes of one pass type.

    Rows of the global ``category`` frame whose 'pass_type' equals
    ``cat_type`` are counted per traffic class and drawn on the current
    matplotlib axes. The first wedge is pulled out and coloured differently
    from the others.
    """
    palette = sb.color_palette()
    subset = category[category['pass_type'] == cat_type]
    per_class = subset.groupby([subset['traffic']]).size().reset_index(name='stations')
    # keep only the classes that have at least one row
    present = per_class[per_class['stations'] != 0]
    wedge_sizes = present.stations
    wedge_labels = present.traffic.sort_values(ascending=True).unique()
    n_wedges = len(wedge_labels)
    # styling for up to five wedges; only the first n_wedges entries are used
    wedge_colors = ['bisque'] + [palette[3]] * 4
    wedge_offsets = [0.2, 0, 0, 0, 0]
    plt.pie(wedge_sizes, labels = wedge_labels, startangle = 90,
            counterclock = False, wedgeprops = {'width' : 0.4}, shadow=False,
            explode=wedge_offsets[0:n_wedges], colors=wedge_colors[0:n_wedges], textprops={'fontsize': 12},
            autopct='%1.0f%%', labeldistance=1.1, pctdistance=0.8)
    plt.title(cat_type+ '\n', weight='bold', color='grey', fontsize=14)
    plt.axis('square');
# One donut chart per pass type, laid out on a 2 x 3 grid.
# Assign palette as per requirement
sb.set_style('white')
flatui = ['indianred', 'lightcoral', 'darksalmon', 'salmon', 'lightsalmon']
sb.set_palette(flatui, n_colors=5, desat=0.6)
types = category.pass_type.unique()
Ncount = len(types)
plt.figure(figsize = [12, 8])
# subplot positions are 1-based; one panel per pass type
for panel, pass_type_name in enumerate(['Walk-up', 'One Day', 'Monthly', 'Flex', 'Annual'], start = 1):
    plt.subplot(2, 3, panel)
    plot_pie(pass_type_name)
plt.suptitle('Classification of End station traffic based on pass type', fontsize = 16, weight = 'bold')
plt.subplots_adjust(top=0.85)
plt.subplots_adjust(wspace=0.4, hspace=0.3);
# bbox_inches='tight' grows the saved area so every label is included
plt.savefig('plots/3.2.11.d Pie chart classification of End stations Return traffic based on pass type.png', dpi=300, bbox_inches='tight')
# Count of end stations per traffic class, one facet per pass type.
# Assign palette as per requirement
sb.set_style('white')
flatui = ['indianred', 'lightcoral', 'darksalmon', 'salmon', 'lightsalmon']
sb.set_palette(flatui, n_colors=5, desat=0.6)
# plot Facet Grid
g = sb.FacetGrid(data = category, col = 'pass_type', col_wrap = 3, height = 3.5, aspect = 1, hue = 'pass_type')
g.map(sb.countplot, 'traffic', order = category.traffic.sort_values(ascending=True).unique())
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('End station returns traffic based on pass type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'grey')
# Build ONE set of tick labels from the first facet (the grid is created with the
# default shared axes). Collecting labels from every facet yields a list several
# times longer than the number of ticks, and parsing the label text with int()
# fails when the text is empty or not an integer, so the y labels are derived
# from the tick positions instead.
first_ax = g.axes.flat[0]
y_tick_names = ['{:0.0f}'.format(tick) for tick in first_ax.get_yticks()]
x_tick_names = [label.get_text() for label in first_ax.get_xticklabels()]
# improve plot aesthetics
g.set_yticklabels(y_tick_names, size = 11)
g.set_xticklabels(x_tick_names, size = 11)
g.set_xlabels('\nBike Return traffic', size = 13)
g.set_ylabels('End stations count\n', size = 13)
g.add_legend(bbox_to_anchor=(0.8, 0.25), scatterpoints=1,frameon=True, fancybox=True,
             shadow=False, framealpha = 1, borderpad=1, borderaxespad=1, labelspacing=0.5,
             ncol = 1, title='Pass type', title_fontsize=14, fontsize=10, facecolor='white',
             markerfirst=True, handlelength=2, handletextpad=0.5);
# add annotations: write each bar's height just above the bar
for ax in g.axes.ravel(): #this will loop over the different figures in the grid
    for p in ax.patches: #this will loop over the different bars in each figure
        ax.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
plt.subplots_adjust(top=0.85)
plt.subplots_adjust(wspace=0.1, hspace=0.2);
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.11.e Facet grid classification of End stations Return traffic based on pass type.png', dpi=300, bbox_inches='tight')
# Clustered bar chart: pass type on the x axis, one bar per traffic class.
# Assign palette as per requirement
sb.set_style('white')
flatui = ['bisque', 'lightsalmon', 'darksalmon', 'salmon', 'tomato']
sb.set_palette(flatui, n_colors=5, desat=0.6)
plt.figure(figsize = [12, 4])
# plot clustered bar chart
g = sb.countplot(data = category, x = 'pass_type', hue = 'traffic', alpha = 0.8, saturation = 1)
# improve plot aesthetics
plt.title('Distribution of End station traffic based on pass type\n\n', fontsize = 16, weight = 'bold')
# the x axis holds the pass types and the hue (legend) holds the traffic classes;
# the axis label and legend title below name them accordingly
plt.xlabel('\nPass type', fontsize = 14)
plt.ylabel('Number of stations\n', fontsize = 14)
# y ticks are hidden because every bar is annotated with its count below
plt.yticks([], fontsize = 12)
plt.xticks(fontsize = 12)
sb.despine(left=True)
# plot legend
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1, framealpha = 1,
           borderpad=1, borderaxespad=1, bbox_to_anchor = (1, 0.8), loc = 6, labelspacing=0.5,
           title='Return traffic', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5)
# add annotations
# -------------------------------------------------------
for p in g.patches:
    g.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()),
               ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
# -------------------------------------------------------
# dashed vertical lines between neighbouring groups of bars
separators = [0.5, 1.5, 2.5, 3.5]
for loc in separators:
    plt.axvline(loc, ls='--', color='grey', linewidth=1, alpha=0.4);
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.11.f Distribution of End stations Return traffic based on pass type.png', dpi=300, bbox_inches='tight')
# Clustered bar chart: traffic class on the x axis, one bar per pass type.
# Assign palette as per requirement
sb.set_style('white')
flatui = ['indianred', 'lightcoral', 'darksalmon', 'salmon', 'lightsalmon']
# the bars use the 'deep' palette; `flatui` above is assigned but not applied here
sb.set_palette('deep', n_colors=5, desat=0.6)
plt.figure(figsize = [12, 4])
g = sb.countplot(data = category, x = 'traffic', hue = 'pass_type', alpha = 0.8, saturation = 1)
# titles and axis text; y ticks are hidden because each bar carries its count
plt.title('Classification of End station traffic over pass type\n\n', fontsize = 16, weight = 'bold')
plt.xlabel('\n Return traffic', fontsize = 14)
plt.ylabel('Number of stations\n', fontsize = 14)
plt.yticks([], fontsize = 10)
plt.xticks(fontsize = 10)
sb.despine(left=True)
# legend
legend_options = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1, framealpha = 1,
                      borderpad=1, borderaxespad=1, bbox_to_anchor = (0.85, 0.8), loc = 6, labelspacing=0.5,
                      title='Pass type', title_fontsize=12, fontsize=10, facecolor='white',
                      markerfirst=True, handlelength=2, handletextpad=0.5)
plt.legend(**legend_options)
# write each bar's height just above the bar
for bar in g.patches:
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    g.annotate(format(bar_top, '.0f'), (bar_center, bar_top),
               ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
# dashed vertical lines between neighbouring groups of bars
separators = [0.5, 1.5, 2.5, 3.5]
for boundary in separators:
    plt.axvline(boundary, ls='--', color='grey', linewidth=1, alpha=0.4);
# bbox_inches='tight' grows the saved area so every label is included
plt.savefig('plots/3.2.11.g Classification of End stations Return traffic based on pass type.png', dpi=300, bbox_inches='tight')
end_station_id and fare_type columns:
Column: end_station_id, fare_type
Data type: numerical data, continuous
Plot: Scatter plot
Distribution of end_stations return traffic based on fare_type.
# count the trips ending at each station, separately for each fare type
station_fare_groups = bikeshare.groupby([bikeshare['end_station_id'], bikeshare['fare_type']])
end_stations = station_fare_groups.size().reset_index(name='returns')
end_stations.head()
Calculate the max value of the count to estimate the bin size for the following plot.
# summary statistics of the 'returns' column; the accompanying text reads the max off this to pick a bin width
end_stations['returns'].describe()
As the max value is around 36000, let the bin size be 1000.
# Histogram of returns per end station, one facet per fare type.
# Assign palette as per requirement
sb.set_style('white')
flatui = ['salmon', 'lightsalmon']
sb.set_palette(flatui, n_colors=2, desat=0.6)
# prepare data for plotting
max_value = end_stations['returns'].max()
# NOTE(review): the accompanying text proposes a bin size of 1000 but the bins
# here are 500 wide - confirm which one is intended
bin_edges = np.arange(0, max_value+500, 500)
# tick positions in data units (number of returns), one every 5,000
tick_locs = np.arange(0, max_value+5000, 5000)
# plot facet grid
g = sb.FacetGrid(data = end_stations, col = 'fare_type', col_wrap = 2, height = 4, aspect = 1.2, hue = 'fare_type')
g.map(plt.hist, "returns", bins = bin_edges)
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Distribution of End stations Return traffic based on fare type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
# improve plot aesthetics
x_tick_max = (max_value/1000)
# the same tick positions expressed in thousands, used for the "K" labels
x_tick_locs = tick_locs / 1000
x_tick_names = ['{:0.0f}K'.format(loc) for loc in x_tick_locs]
# Pin the tick positions BEFORE relabelling them. Without this the labels
# (previously one per 1,000 returns) are attached to whatever ticks matplotlib
# picked automatically, so a tick at 5000 would read "1K".
g.set(xticks = tick_locs)
g.set_xticklabels(x_tick_names, size=12)
g.set_yticklabels(size = 12)
g.set_xlabels('\nBike returns', size = 14)
g.set_ylabels('End Station count\n', size = 14)
plt.subplots_adjust(wspace=0.1, hspace=0.3);
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.12.a Distribution of End stations Return traffic based on fare type.png', dpi=300, bbox_inches='tight')
This data, in its natural units, looks highly right skewed: lots of points with low values, with a very long tail of data points with large values (also, all values are positive). The most common example of this is data that follows an approximately log-normal distribution: in its natural units it can look highly skewed, but after applying a logarithmic transform the data will follow a normal distribution. Hence let us apply a logarithmic transformation to the end stations' bike return data.
def log_trans(x, inverse = False):
    """Convert between natural units and base-10 logarithmic units.

    With ``inverse`` left False the base-10 logarithm of ``x`` is returned;
    with ``inverse`` True the transform is undone by raising 10 to ``x``.
    """
    if inverse:
        return 10 ** x
    return np.log10(x)
# Histogram of log10(returns) per end station, one facet per fare type.
# Assign palette as per requirement
sb.set_style('white')
flatui = ['salmon', 'lightsalmon']
sb.set_palette(flatui, desat = 0.6)
# prepare data for plotting
end_stations['log_count'] = end_stations['returns'].apply(log_trans)
min_value = log_trans(end_stations['returns'].min())
max_value = log_trans(end_stations['returns'].max())
# bins of 0.1 in log10 units; one tick per decade (0 -> 1, 1 -> 10, 2 -> 100, ...)
bin_edges = np.arange(0, max_value+0.1, 0.1)
tick_locs = np.arange(0, max_value+1, 1)
# plot facet grid
g = sb.FacetGrid(data = end_stations, col = 'fare_type', col_wrap = 2, height = 4, aspect = 1, hue = 'fare_type')
g.map(plt.hist, "log_count", bins = bin_edges)
g.fig.subplots_adjust(top=0.7)
g.fig.suptitle('Logarithmic distribution of End stations Return traffic over fare type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
# improve plot aesthetics
x_tick_names = log_trans(tick_locs, inverse = True).astype(int)
# Pin the tick positions to the decades BEFORE relabelling them. Without this the
# decade labels are attached to whatever ticks matplotlib picked automatically,
# so the axis would show values that do not match the data.
g.set(xticks = tick_locs)
g.set_xticklabels(x_tick_names, size = 12)
g.set(yticks=[0, 5, 10, 15, 20])
g.set_xlabels('\nBike returns', size = 14)
g.set_ylabels('End Station count\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3);
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.12.b Logarithmic distribution of End stations Return traffic based on fare type.png', dpi=300, bbox_inches='tight')
def log_trans(x, inverse = False):
    """Convert between natural units and base-10 logarithmic units.

    With ``inverse`` left False the base-10 logarithm of ``x`` is returned;
    with ``inverse`` True the transform is undone by raising 10 to ``x``.
    """
    if inverse:
        return 10 ** x
    return np.log10(x)
# Overlaid histograms of log10(returns) for the 'Base' and 'Extended' fare types.
# Assign palette as per requirement
sb.set_style('white')
sb.set_palette('deep', desat = 0.8)
# Both histograms share one set of bin edges, so the range must cover BOTH fare
# types. Deriving it from 'Extended' alone silently drops every 'Base' value
# that lies above the largest 'Extended' value.
min_value = log_trans(end_stations['returns'].min())
max_value = log_trans(end_stations['returns'].max())
bin_edges = np.arange(0, max_value+0.1, 0.1)
plt.hist(end_stations[end_stations['fare_type'] == "Base"]['returns'].apply(log_trans),
         bins = bin_edges, color = sb.color_palette()[0], alpha=0.6, label = 'Base')
plt.hist(end_stations[end_stations['fare_type'] == "Extended"]['returns'].apply(log_trans),
         bins = bin_edges, color = sb.color_palette()[1], alpha=0.6, label = 'Extended')
# ticks sit on whole decades and are labelled with the value in natural units
tick_locs = np.arange(0, max_value+1, 1)
plt.xticks(tick_locs, log_trans(tick_locs, inverse = True).astype(int), fontsize = 12)
plt.yticks(fontsize = 12)
plt.xlabel('\nBike returns', fontsize = 14)
plt.ylabel('End Station count\n', fontsize = 14)
plt.title('Logarithmic distribution of end stations Return traffic\n\n', fontsize = 16, weight = 'bold')
# obtain y_ticks and convert them to integers
y_locs, y_labels = plt.yticks()
y_tick_max = int(max(y_locs))
y_tick_locs = np.arange(0, y_tick_max+5, 5)
y_tick_names = ['{:0.0f}'.format(y_loc) for y_loc in y_tick_locs]
plt.yticks(y_tick_locs, y_tick_names, size = 12)
plt.legend(bbox_to_anchor=(1.3, 1), scatterpoints=1,frameon=True, fancybox=True,
           shadow=False, framealpha = 1, borderpad=1, borderaxespad=1, labelspacing=0.5,
           ncol = 1, title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, loc = 'upper right')
sb.despine();
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.12.c Logarithmic distribution of End stations Return traffic based on fare type.png', dpi=300, bbox_inches='tight')
def log_trans(x, inverse = False):
    """Convert between natural units and base-10 logarithmic units.

    With ``inverse`` left False the base-10 logarithm of ``x`` is returned;
    with ``inverse`` True the transform is undone by raising 10 to ``x``.
    """
    if inverse:
        return 10 ** x
    return np.log10(x)
# Overlaid, shaded density curves of log10(returns), one curve per fare type.
# Assign palette as per requirement
sb.set_style('white')
sb.set_palette('deep', desat = 0.8)
# the axis range is taken from the 'Base' fare type
base_returns = end_stations[end_stations['fare_type'] == "Base"]['returns']
min_value = log_trans(base_returns.min())
max_value = log_trans(base_returns.max())
bin_edges = np.arange(0, max_value+0.1, 0.1)
# one curve per fare type; the i-th type takes the i-th palette colour
for color_index, fare_type_name in enumerate(['Base', 'Extended']):
    type_mask = end_stations['fare_type'] == fare_type_name
    sb.distplot(end_stations[type_mask]['returns'].apply(log_trans),
                bins = bin_edges, kde = True, kde_kws = {'alpha' :0.8, "shade": True},
                color = sb.color_palette()[color_index], label = fare_type_name, hist=False)
# ticks sit on whole decades and are labelled with the value in natural units
tick_locs = np.arange(0, max_value+1, 1)
plt.xticks(tick_locs, log_trans(tick_locs, inverse = True).astype(int), fontsize = 12)
plt.yticks(fontsize = 12)
plt.xlabel('\nBike returns', fontsize = 14)
plt.ylabel('Kernal Density Estimation\n', fontsize = 14)
plt.title('Logarithmic distribution of End stations Return traffic\n', fontsize = 16, weight = 'bold')
legend_options = dict(bbox_to_anchor=(1.3, 1), scatterpoints=1, frameon=True, fancybox=True,
                      shadow=False, framealpha = 1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                      ncol = 1, title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
                      markerfirst=True, handlelength=2, handletextpad=0.5, loc = 'upper right')
plt.legend(**legend_options);
sb.despine();
# bbox_inches='tight' grows the saved area so every axis label is included
plt.savefig('plots/3.2.12.d Kernal Density Estimation of End stations Return traffic based on fare type.png', dpi=300, bbox_inches='tight')
# Decade-wide bin edges for the per-station return counts.
# NOTE: `bin` shadows the Python builtin of the same name; the name is kept as-is.
bin = [0,10,100,1000,10000,100000]
# pd.cut assigns every return count to the interval it falls in; the frame is
# assembled directly in its final column order
category = pd.DataFrame({
    'end_station_id': end_stations['end_station_id'],
    'fare_type': end_stations['fare_type'],
    'return_bins': pd.cut(end_stations['returns'], bin),
})
category.head()
# the distinct intervals that actually occur, in ascending order
category.return_bins.sort_values(ascending=True).unique()
%%time
def assign_traffic(df):
    """Label every row of ``df`` with a traffic level derived from its return bin.

    The lowest bin becomes 'Very Low', the next 'Low', then 'Normal', 'High'
    and 'Very High'. The result is written to a 'traffic' column; ``df`` is
    modified in place and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'return_bins' column, normally the categorical
        produced by ``pd.cut``.
    """
    labels = ['Very Low', 'Low', 'Normal', 'High', 'Very High']
    bins = df['return_bins']
    if isinstance(bins.dtype, pd.api.types.CategoricalDtype):
        # Use every bin that was defined, not only the ones that occur in the
        # data: if a bin happens to be empty, labels taken from the observed
        # values would shift onto the wrong bins.
        levels = bins.cat.categories
    else:
        levels = bins.dropna().sort_values(ascending=True).unique()
    label_by_level = dict(zip(levels, labels))
    # astype(object) keeps the result a plain label column; rows without a
    # bin (NaN) stay unlabelled
    df['traffic'] = bins.astype(object).map(label_by_level)
# 'df' is a second name for 'category' (no copy), so the new column lands on 'category' itself
df = category
assign_traffic(df)

# Turn the text labels into an ordered categorical so that sorting follows the traffic level
level_order = ['Very Low', 'Low', 'Normal', 'High', 'Very High']
ordered_cat = pd.api.types.CategoricalDtype(categories = level_order, ordered = True)
traffic_labels = df['traffic']
df['traffic'] = traffic_labels.astype(ordered_cat)

# Number of stations per traffic level
category['traffic'].value_counts()
def plot_pie(cat_type):
    """Draw a donut chart of the station count per traffic level for one fare type.

    Reads the module-level ``category`` frame, keeps the rows whose 'fare_type'
    equals ``cat_type`` and draws onto the current matplotlib axes.

    Parameters
    ----------
    cat_type : str
        Fare type to plot; also used as the chart title.
    """
    fare_rows = category[category['fare_type'] == cat_type]
    level_counts = fare_rows.groupby([fare_rows['traffic']]).size().reset_index(name='stations')

    # Levels without any station would only add empty wedges
    occupied = level_counts.loc[level_counts['stations'] != 0]
    wedge_sizes = occupied.stations
    wedge_labels = occupied.traffic.sort_values(ascending=True).unique()
    n_wedges = len(wedge_labels)

    # The first wedge is pulled out and drawn in the second palette colour
    palette = sb.color_palette()
    wedge_colours = [palette[1]] + [palette[0]] * 4
    wedge_offsets = [0.2] + [0] * 4

    plt.pie(wedge_sizes, labels = wedge_labels, startangle = 90,
            counterclock = False, wedgeprops = {'width' : 0.4}, shadow=False,
            explode=wedge_offsets[0:n_wedges], colors=wedge_colours[0:n_wedges],
            textprops={'fontsize': 14}, autopct='%1.0f%%', labeldistance=1.1, pctdistance=0.8)
    plt.title(cat_type+ '\n\n', weight='bold', color='grey', fontsize=14)
    plt.axis('square');
# Plot styling: white background and a two-colour salmon/bisque palette
sb.set_style('white')
flatui = ['salmon', 'bisque']
sb.set_palette(flatui, desat = 0.6)

types = category.fare_type.unique()
Ncount = len(category.fare_type.unique())

plt.figure(figsize = [12, 6])

# One donut chart per fare type, side by side
for position, fare in enumerate(['Base', 'Extended'], start=1):
    plt.subplot(1, 2, position)
    plot_pie(fare)

plt.suptitle('Classification of End station return traffic based on fare type', fontsize = 16, weight = 'bold')
# Leave room for the figure title and put some space between the two charts
plt.subplots_adjust(top=0.7, wspace=0.3, hspace=0.3);

# bbox_inches='tight' grows the saved area so that no label is cut off
plt.savefig('plots/3.2.12.e Pie chart classification of End stations Return traffic based on fare type.png', dpi=300, bbox_inches='tight')
# Plot styling: white background and a two-colour salmon palette (one colour per fare type)
sb.set_style('white')
flatui = ['salmon', 'lightsalmon']
sb.set_palette(flatui, desat = 0.8)

# No plt.figure() call here: FacetGrid creates its own figure, so a figure
# opened beforehand would stay empty and show up as a blank plot.
# One facet per fare type, each with a bar per traffic level.
g = sb.FacetGrid(data = category, col = 'fare_type', col_wrap = 2, height = 4, aspect = 1, hue = 'fare_type')
g.map(sb.countplot, 'traffic', order = category.traffic.sort_values(ascending=True).unique())
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('End station return traffic based on fare type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}\n', weight = 'bold', size = 14, color = 'grey')

# improve plot aesthetics
g.set_yticklabels(size = 10)
g.set_xticklabels(size = 10)
g.set_xlabels('\nReturn traffic', size = 12)
g.set_ylabels('End stations count\n', size = 12)
g.add_legend(bbox_to_anchor=(1, 0.7), scatterpoints=1,frameon=True, fancybox=True,
             shadow=False, framealpha = 1, borderpad=1, borderaxespad=1, labelspacing=0.5,
             ncol = 1, title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
             markerfirst=True, handlelength=2, handletextpad=0.5);
# The FacetGrid figure is the current figure, so these adjust it; top=0.7 replaces the 0.8 set above
plt.subplots_adjust(top=0.7)
plt.subplots_adjust(wspace=0.2, hspace=0.3)

# add annotations: write each bar's count just above the bar
for ax in g.axes.ravel():   # every facet of the grid
    for p in ax.patches:    # every bar of that facet
        ax.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points', fontsize=10);

# bbox_inches='tight' grows the saved area so that labels and the legend are not cut off
plt.savefig('plots/3.2.12.f Facet grid classification of End stations Return traffic based on fare type.png', dpi=300, bbox_inches='tight')
# Plot styling: white background and five warm colours, one per traffic level
sb.set_style('white')
flatui = ['bisque', 'lightsalmon', 'darksalmon', 'salmon', 'tomato']
sb.set_palette(flatui, n_colors=5, desat=0.6)
plt.figure(figsize = [6, 4])

# Clustered bar chart: one cluster per fare type, one bar per traffic level
g = sb.countplot(data = category, x = 'fare_type', hue = 'traffic', alpha = 0.8, saturation = 0.9)

# improve plot aesthetics
plt.title('Classification of End station traffic based on fare type\n\n', fontsize = 16, weight = 'bold')
# The x axis carries the fare type and the colours carry the traffic level,
# so the axis label and the legend title name those variables.
plt.xlabel('\nFare type', fontsize = 14)
plt.ylabel('Number of stations\n', fontsize = 14)
# y ticks are removed because every bar is annotated with its count below
plt.yticks([], fontsize = 12)
plt.xticks(fontsize = 12)
sb.despine(left=True)

# plot legend, anchored to the right of the axes
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1, framealpha = 1,
           borderpad=1, borderaxespad=1, bbox_to_anchor = (1, 0.8), loc = 6, labelspacing=0.5,
           title='Return traffic', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5)

# add annotations: write each bar's count just above the bar
for p in g.patches:
    g.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()),
               ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')

# Dashed vertical lines at x = 0.5 and x = 1.5, i.e. between neighbouring clusters
separators = [0.5, 1.5]
for loc in separators:
    plt.axvline(loc, ls='--', color='grey', linewidth=1, alpha=0.4);

# bbox_inches='tight' grows the saved area so that labels and the outside legend are not cut off
plt.savefig('plots/3.2.12.g Distribution of End stations Return traffic based on fare type.png', dpi=300, bbox_inches='tight')
# Plot styling: white background and a two-tone pink palette (one colour per fare type)
sb.set_style('white')
flatui = ['#cc2e88', '#fa98d0']
sb.set_palette(flatui, desat = 0.8)
plt.figure(figsize = [6, 4])

# Clustered bar chart: one cluster per traffic level, one bar per fare type
g = sb.countplot(data = category, x = 'traffic', hue = 'fare_type', alpha = 0.8, saturation = 0.9)

# Title and axis labels; the y ticks are removed because each bar is annotated with its count
plt.title('Classification of End station traffic based on fare type\n\n', fontsize = 16, weight = 'bold')
plt.xlabel('\nReturn traffic', fontsize = 14)
plt.ylabel('Number of stations\n', fontsize = 14)
plt.yticks([], fontsize = 12)
plt.xticks(fontsize = 12)
sb.despine(left=True)

# Legend placed inside the axes, near the top right
legend_style = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1, framealpha = 1,
                    borderpad=1, borderaxespad=1, bbox_to_anchor = (0.8, 0.9), loc = 6, labelspacing=0.5,
                    title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
                    markerfirst=True, handlelength=2, handletextpad=0.5)
plt.legend(**legend_style)

# Write each bar's count just above the bar, centred on it
for bar in g.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    g.annotate(format(bar_height, '.0f'), (bar_centre, bar_height),
               ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')

# bbox_inches='tight' grows the saved area so that no label is cut off
plt.savefig('plots/3.2.12.h Classification of End stations Return traffic based on fare type.png', dpi=300, bbox_inches='tight')
trip_type and bike_type columns:
Columns: bike_type, trip_type
Data type: Categorical, nominal
Plot: Clustered Bar chart, Heatmap
Clustered Bar chart:
To depict the relationship between two categorical variables, we can extend the univariate bar chart into a
clustered bar chart. In a clustered bar chart, bars are organized into clusters based on the levels of the first variable, and within each cluster the bars are ordered consistently by the levels of the second variable.
# Plot styling: desaturated 'deep' palette on a white background
sb.set_palette(palette = "deep", n_colors = 5, desat = 0.6)
current_palette = sb.color_palette()
sb.set_style('white')
plt.figure(figsize = [6, 4])

# Largest number of rentals of any (bike type, trip type) pair: sets how far the y ticks must reach
pair_counts = bikeshare.groupby([bikeshare['bike_type'], bikeshare['trip_type']]).count()
pair_counts = pair_counts[['trip_id']].reset_index().rename(columns={'trip_id':'count'})
max_count = pair_counts['count'].max()

# A tick every 100,000 rentals, labelled in thousands
y_tick_values = np.arange(0, max_count + 100000, 100000)
y_tick_names = ['{:0.0f} K'.format(tick/1000) for tick in y_tick_values]

# Clustered bar chart: one cluster per bike type, one bar per trip type
sb.countplot(data = bikeshare, x = 'bike_type', hue = 'trip_type', saturation = 0.8)
plt.title('Distribution of rentals based on bike type and trip type\n\n', weight = 'bold', fontsize = 16)
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nBike Type', fontsize=14)
plt.ylabel('Rentals (thousands)\n', fontsize=14)

legend_style = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1, framealpha = 1,
                    borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5,
                    title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
                    markerfirst=True, handlelength=2, handletextpad=0.5)
plt.legend(**legend_style)
sb.despine();

# bbox_inches='tight' grows the saved area so that no label is cut off
plt.savefig('plots/3.2.13.a Distribution of rentals based on bike type and trip type.png', dpi=300, bbox_inches='tight')
Facet Grid:
One general visualization technique that will be useful for you to know about to handle plots of two or more variables is
faceting. In faceting, the data is divided into disjoint subsets, most often by different levels of a categorical variable. For each of these subsets of the data, the same plot type is rendered on other variables. Faceting is a way of comparing distributions or relationships across levels of additional variables, especially when there are three or more variables of interest overall.
from matplotlib.ticker import FuncFormatter

sb.set_style('white')
sb.set_palette(palette = "deep", n_colors = 5, desat = 0.8)

# Largest number of rentals of any (bike type, trip type) pair (not used by the plot below)
max_count = bikeshare.groupby([bikeshare['bike_type'],
                               bikeshare['trip_type']]).count()[['trip_id']].reset_index().rename(columns=
                               {'trip_id':'count'})['count'].max()

# plot Facet Grid: one facet per trip type, bars ordered by overall bike type frequency
g = sb.FacetGrid(data = bikeshare, col = 'trip_type', col_wrap = 2, height = 4, aspect = 1, hue = 'trip_type')
g.map(sb.countplot, 'bike_type', order=bikeshare['bike_type'].value_counts().index)
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Distribution of rentals based on trip type and bike type\n\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'grey')

# Show the y ticks in thousands. A formatter labels every facet from its own
# tick values; a fixed list of label texts gathered across all facets holds
# more entries than a single axis has ticks, which matplotlib rejects.
thousands = FuncFormatter(lambda value, _pos: '{:0.0f} K'.format(value/1000))
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(thousands)
    ax.tick_params(axis='y', labelsize=12)

g.set_xticklabels(size = 12)
g.set_xlabels('\nBike Type', size = 14)
g.set_ylabels('Rentals (thousands)\n', size = 14)
g.add_legend(bbox_to_anchor=(1.05, 0.7), scatterpoints=1,frameon=True, fancybox=True,
             shadow=False, framealpha = 1, borderpad=1, borderaxespad=1, labelspacing=0.5,
             ncol = 1, title='Trip type', title_fontsize=14, fontsize=12, facecolor='white',
             markerfirst=True, handlelength=2, handletextpad=0.5);

# bbox_inches='tight' grows the saved area so that labels and the legend are not cut off
plt.savefig('plots/3.2.13.b Facet Grid of rentals based on bike type and trip type.png', dpi=300, bbox_inches='tight')
Heat Map:
One alternative way of depicting the relationship between two categorical variables is through a Heat map. Heat maps were introduced earlier as the 2-d version of a histogram; here, we're using them as the 2-d version of a bar chart. The seaborn function heatmap is at home with this type of heat map implementation, but the input arguments are unlike most of the visualization functions. Instead of providing the original dataframe, we need to summarize the counts into a matrix that will then be plotted.
The data first needs to be summarized by grouping the two variables together.
# Sum of all group sizes
group_keys = ['bike_type', 'trip_type']
bikeshare.groupby(group_keys).size().reset_index(name='count')[['count']].sum()

# Number of rentals for every (bike type, trip type) combination
categorical_counts = bikeshare.groupby(group_keys).size()
categorical_counts
Now, let's reset the index and give the summarized values a descriptive column name.
# Move the group keys from the index into ordinary columns; the sizes go into a 'count' column
categorical_counts = categorical_counts.rename('count').reset_index()
categorical_counts
Now let's pivot the summarized counts into a matrix, which is the data structure the heat map expects.
# Reshape the long table into a matrix: one row per bike type, one column per
# trip type, each cell holding the corresponding 'count' value
categorical_counts = categorical_counts.pivot(index = 'bike_type', columns = 'trip_type', values = 'count')
categorical_counts
The data is ready to plot the
Heat map.
# Heat map of the count matrix; every cell is first annotated with its raw integer count
plt.figure(figsize = [6, 6])
res = sb.heatmap(categorical_counts, annot = True, fmt = 'd', annot_kws={'size':14}, linewidths=0.1, cmap="YlGnBu")
plt.title('Rentals based on bike type and trip type\n', fontsize = 16, weight = 'bold')
plt.xlabel('\nTrip Type', fontsize=14)
plt.ylabel('Bike Type\n', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(rotation=0, fontsize=12, va="center");

# Replace each annotated count by its share of all rentals, in percent with one decimal;
# shares that round to less than 0.1 % are shown as '< 0.1 %'
total_rentals = bikeshare.shape[0]
for cell_text in res.texts:
    share = int(cell_text.get_text())/total_rentals
    percent = np.round(np.round(share, 4)*100, 1)
    cell_text.set_text('< 0.1 %' if percent < 0.1 else '{} %'.format(percent))

# bbox_inches='tight' grows the saved area so that no label is cut off
plt.savefig('plots/3.2.13.c Distribution of bike rentals based on bike type and trip type.png', dpi=300, bbox_inches='tight')
trip_type and pass_type columns:
Columns: pass_type, trip_type
Data type: Categorical, nominal
Plot: Clustered Bar chart, Heatmap
Clustered Bar chart:
To depict the relationship between two categorical variables, we can extend the univariate bar chart into a
clustered bar chart. In a clustered bar chart, bars are organized into clusters based on the levels of the first variable, and within each cluster the bars are ordered consistently by the levels of the second variable.
# Plot styling: 'deep' palette on a white background
sb.set_palette(palette = "deep", n_colors = 5, desat = 0.8)
current_palette = sb.color_palette()
sb.set_style('white')
plt.figure(figsize = [6, 4])

# Largest number of rentals of any (pass type, trip type) pair: sets how far the y ticks must reach
pair_counts = bikeshare.groupby([bikeshare['pass_type'], bikeshare['trip_type']]).count()
pair_counts = pair_counts[['trip_id']].reset_index().rename(columns={'trip_id':'count'})
max_count = pair_counts['count'].max()

# A tick every 100,000 rentals, labelled in thousands
y_tick_values = np.arange(0, max_count + 100000, 100000)
y_tick_names = ['{:0.0f} K'.format(tick/1000) for tick in y_tick_values]

# Clustered bar chart: one cluster per pass type, one bar per trip type
sb.countplot(data = bikeshare, x = 'pass_type', hue = 'trip_type', saturation = 0.8)
plt.title('Distribution of rentals based on pass type and trip type\n\n', fontsize = 16, weight = 'bold')
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nPass Type', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)

legend_style = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha = 1,
                    borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5, ncol = 1,
                    title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
                    markerfirst=True, handlelength=2, handletextpad=0.5)
plt.legend(**legend_style)
sb.despine();

# bbox_inches='tight' grows the saved area so that no label is cut off
plt.savefig('plots/3.2.14.a Distribution of rentals based on pass type and trip type.png', dpi=300, bbox_inches='tight')
Facet Grid:
One general visualization technique that will be useful for you to know about to handle plots of two or more variables is
faceting. In faceting, the data is divided into disjoint subsets, most often by different levels of a categorical variable. For each of these subsets of the data, the same plot type is rendered on other variables. Faceting is a way of comparing distributions or relationships across levels of additional variables, especially when there are three or more variables of interest overall.
from matplotlib.ticker import FuncFormatter

sb.set_style('white')
sb.set_palette(palette = "deep", n_colors = 5, desat = 0.8)

# Largest number of rentals of any (pass type, trip type) pair (not used by the plot below)
max_count = bikeshare.groupby([bikeshare['pass_type'],
                               bikeshare['trip_type']]).count()[['trip_id']].reset_index().rename(columns=
                               {'trip_id':'count'})['count'].max()

# Same bar order in every facet: the distinct pass types in ascending order
pass_order = bikeshare.pass_type.sort_values(ascending=True).unique()

# plot Facet Grid: one facet per trip type, one bar per pass type
g = sb.FacetGrid(data = bikeshare, col = 'trip_type', col_wrap = 2, height = 4, aspect = 1, hue = 'trip_type')
g.map(sb.countplot, 'pass_type', order = pass_order)
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Distribution of rentals based on trip type and pass type\n\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'grey')

# Show the y ticks in thousands. A formatter labels every facet from its own
# tick values; a fixed list of label texts gathered across all facets holds
# more entries than a single axis has ticks, which matplotlib rejects.
thousands = FuncFormatter(lambda value, _pos: '{:0.0f} K'.format(value/1000))
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(thousands)
    ax.tick_params(axis='y', labelsize=11)

g.set_xticklabels(size = 11)
g.set_xlabels('\nPass Type', size = 13)
g.set_ylabels('Rentals (thousands)\n', size = 13)
g.add_legend(bbox_to_anchor=(1, 0.7), scatterpoints=1,frameon=True, fancybox=True,
             shadow=False, framealpha = 1, borderpad=1, borderaxespad=1, labelspacing=0.5,
             ncol = 1, title='Trip type', title_fontsize=14, fontsize=12, facecolor='white',
             markerfirst=True, handlelength=2, handletextpad=0.5);

# bbox_inches='tight' grows the saved area so that labels and the legend are not cut off
plt.savefig('plots/3.2.14.b Facet Grid of rentals based on pass type and trip type.png', dpi=300, bbox_inches='tight')
Heat Map:
One alternative way of depicting the relationship between two categorical variables is through a Heat map. Heat maps were introduced earlier as the 2-d version of a histogram; here, we're using them as the 2-d version of a bar chart. The seaborn function heatmap is at home with this type of heat map implementation, but the input arguments are unlike most of the visualization functions. Instead of providing the original dataframe, we need to summarize the counts into a matrix that will then be plotted.
The data first needs to be summarized by grouping the two variables together.
# Number of rentals for every (pass type, trip type) combination
group_keys = ['pass_type', 'trip_type']
categorical_counts = bikeshare.groupby(group_keys).size()
categorical_counts
Now, let's reset the index and give the summarized values a descriptive column name.
# Move the group keys from the index into ordinary columns; the sizes go into a 'count' column
categorical_counts = categorical_counts.rename('count').reset_index()
categorical_counts
Now let's pivot the summarized counts into a matrix, which is the data structure the heat map expects.
# Reshape the long table into a matrix: one row per pass type, one column per
# trip type, each cell holding the corresponding 'count' value
categorical_counts = categorical_counts.pivot(index = 'pass_type', columns = 'trip_type', values = 'count')
categorical_counts
The data is ready to plot the
Heat map.
# Heat map of the count matrix; every cell is first annotated with its raw integer count
plt.figure(figsize = [6, 6])
heatmap_style = dict(annot = True, fmt = 'd', annot_kws={'size':14}, linewidths=0.1, cmap="YlGnBu")
res = sb.heatmap(categorical_counts, **heatmap_style)
plt.title('Rentals based on pass type and trip type\n', fontsize = 16, weight = 'bold')
plt.xlabel('\nTrip Type', fontsize=14)
plt.ylabel('Pass Type\n', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(rotation=0, fontsize=12, va="center")

# Replace each annotated count by its share of all rentals, in percent with one decimal;
# shares that round to less than 0.1 % are shown as '< 0.1 %'
n_rentals = bikeshare.shape[0]
for annotation in res.texts:
    count = int(annotation.get_text())
    percent = np.round(np.round(count/n_rentals, 4)*100, 1)
    if not percent < 0.1:
        annotation.set_text('{} %'.format(percent))
    else:
        annotation.set_text('< 0.1 %')

# bbox_inches='tight' grows the saved area so that no label is cut off
plt.savefig('plots/3.2.14.c Distribution of rentals based on pass type and trip type.png', dpi=300, bbox_inches='tight')
bike_type and pass_type columns:
Columns: bike_type, pass_type
Data type: Categorical, nominal
Plot: Clustered Bar chart, Heatmap
Clustered Bar chart:
To depict the relationship between two categorical variables, we can extend the univariate bar chart into a
clustered bar chart. In a clustered bar chart, bars are organized into clusters based on the levels of the first variable, and within each cluster the bars are ordered consistently by the levels of the second variable.
# Plot styling: 'GnBu_d' palette on a white background
sb.set_palette(palette = "GnBu_d", n_colors = 5, desat = None)
current_palette = sb.color_palette()
sb.set_style('white')
plt.figure(figsize = [8, 6])

# Largest number of rentals of any (bike type, pass type) pair: sets how far the y ticks must reach
pair_counts = bikeshare.groupby([bikeshare['bike_type'], bikeshare['pass_type']]).count()
pair_counts = pair_counts[['trip_id']].reset_index().rename(columns={'trip_id':'count'})
max_count = pair_counts['count'].max()

# A tick every 50,000 rentals, labelled in thousands
y_tick_values = np.arange(0, max_count + 50000, 50000)
y_tick_names = ['{:0.0f} K'.format(tick/1000) for tick in y_tick_values]

# Clustered bar chart: one cluster per bike type, one bar per pass type
sb.countplot(data = bikeshare, x = 'bike_type', hue = 'pass_type', saturation = 1)
plt.title('Distribution of rentals based on bike type and pass type\n', fontsize = 16, weight = 'bold')
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nBike Type', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)

legend_style = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha = 1,
                    borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5, ncol = 1,
                    title='Pass type', title_fontsize=12, fontsize=10, facecolor='white',
                    markerfirst=True, handlelength=2, handletextpad=0.5)
plt.legend(**legend_style)
sb.despine();

# bbox_inches='tight' grows the saved area so that no label is cut off
plt.savefig('plots/3.2.15.a Distribution of rentals based on bike type and pass type.png', dpi=300, bbox_inches='tight')
Facet Grid:
One general visualization technique that will be useful for you to know about to handle plots of two or more variables is
faceting. In faceting, the data is divided into disjoint subsets, most often by different levels of a categorical variable. For each of these subsets of the data, the same plot type is rendered on other variables. Faceting is a way of comparing distributions or relationships across levels of additional variables, especially when there are three or more variables of interest overall.
from matplotlib.ticker import FuncFormatter

sb.set_style('white')
sb.set_palette(palette = "GnBu_d", n_colors = 5, desat = None)

# Largest number of rentals of any (bike type, pass type) pair (not used by the plot below)
max_count = bikeshare.groupby([bikeshare['bike_type'],
                               bikeshare['pass_type']]).count()[['trip_id']].reset_index().rename(columns=
                               {'trip_id':'count'})['count'].max()

# The bar order is fixed explicitly and the tick labels are derived from that
# same order, so every facet draws the same bars in the same place and a label
# can never end up under the wrong bar.
bike_order = bikeshare['bike_type'].sort_values(ascending=True).unique()
x_tick_names = [str(level) for level in bike_order]

# plot Facet Grid: one facet per pass type, one bar per bike type
g = sb.FacetGrid(data = bikeshare, col = 'pass_type', col_wrap = 3, height = 4, aspect = 1, hue = 'pass_type')
g.map(sb.countplot, 'bike_type', order = bike_order)
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Distribution of rentals based on bike type and pass type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'grey')

# Show the y ticks in thousands. A formatter labels every facet from its own
# tick values; a fixed list of label texts gathered across all facets holds
# more entries than a single axis has ticks, which matplotlib rejects.
thousands = FuncFormatter(lambda value, _pos: '{:0.0f} K'.format(value/1000))
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(thousands)
    ax.tick_params(axis='y', labelsize=11)

g.set_xticklabels(x_tick_names, size = 11)
g.set_xlabels('\nBike Type', size = 13)
g.set_ylabels('Rentals (thousands)\n', size = 13)
g.add_legend(bbox_to_anchor=(0.8, 0.25), scatterpoints=1,frameon=True, fancybox=True,
             shadow=False, framealpha = 1, borderpad=1, borderaxespad=1, labelspacing=0.5,
             ncol = 1, title='Pass type', title_fontsize=14, fontsize=12, facecolor='white',
             markerfirst=True, handlelength=2, handletextpad=0.5);

# bbox_inches='tight' grows the saved area so that labels and the legend are not cut off
plt.savefig('plots/3.2.15.b Facet Grid of rentals based on bike type and pass type.png', dpi=300, bbox_inches='tight')
Heat Map:
One alternative way of depicting the relationship between two categorical variables is through a Heat map. Heat maps were introduced earlier as the 2-d version of a histogram; here, we're using them as the 2-d version of a bar chart. The seaborn function heatmap is at home with this type of heat map implementation, but the input arguments are unlike most of the visualization functions. Instead of providing the original dataframe, we need to summarize the counts into a matrix that will then be plotted.
The data first needs to be summarized by grouping the two variables together.
# Number of rentals for every (pass type, bike type) combination
group_keys = ['pass_type', 'bike_type']
categorical_counts = bikeshare.groupby(group_keys).size()
categorical_counts
Now, let's reset the index and give the summarized values a descriptive column name.
# Move the group keys from the index into ordinary columns; the sizes go into a 'count' column
categorical_counts = categorical_counts.rename('count').reset_index()
categorical_counts
Now let's pivot the summarized counts into a matrix, which is the data structure the heat map expects.
# Reshape the long table into a matrix: one row per pass type, one column per
# bike type, each cell holding the corresponding 'count' value
categorical_counts = categorical_counts.pivot(index = 'pass_type', columns = 'bike_type', values = 'count')
categorical_counts
The data is ready to plot the
Heat map.
# Heat map of the count matrix; every cell is first annotated with its count, printed without decimals
plt.figure(figsize = [6, 6])
res = sb.heatmap(categorical_counts, annot = True, fmt = '.0f', annot_kws={'size':13}, linewidths=0.1, cmap="YlGnBu")
plt.title('Rentals based on pass type and bike type\n', fontsize = 16, weight = 'bold')
plt.xlabel('\nBike Type', fontsize=14)
plt.ylabel('Pass Type\n', fontsize=14)
plt.yticks(rotation=0, fontsize=12, va="center")
plt.xticks(fontsize=12);

# Replace each annotated count by its share of all rentals, in percent with one decimal;
# shares that round to less than 0.1 % are shown as '< 0.1 %'
total_rentals = bikeshare.shape[0]
for cell_text in res.texts:
    share = int(cell_text.get_text())/total_rentals
    percent = np.round(np.round(share, 4)*100, 1)
    cell_text.set_text('< 0.1 %' if percent < 0.1 else '{} %'.format(percent))

# bbox_inches='tight' grows the saved area so that no label is cut off
plt.savefig('plots/3.2.15.c Distribution of rentals based on pass type and bike type.png', dpi=300, bbox_inches='tight')
fare_type and trip_type columns:
Columns: fare_type, trip_type
Data type: Categorical, nominal
Plot: Clustered Bar chart, Heatmap
Clustered Bar chart:
To depict the relationship between two categorical variables, we can extend the univariate bar chart into a
clustered bar chart. In a clustered bar chart, bars are organized into clusters based on the levels of the first variable, and within each cluster the bars are ordered consistently by the levels of the second variable.
# Plot styling: 'GnBu_d' palette on a white background
sb.set_palette(palette = "GnBu_d", n_colors = 5, desat = None)
current_palette = sb.color_palette()
sb.set_style('white')
plt.figure(figsize = [6, 4])

# Largest number of rentals of any (fare type, trip type) pair: sets how far the y ticks must reach
pair_sizes = bikeshare.groupby([bikeshare['fare_type'], bikeshare['trip_type']]).size()
max_count = pair_sizes.reset_index(name='count')['count'].max()

# A tick every 100,000 rentals, labelled in thousands
y_tick_values = np.arange(0, max_count + 100000, 100000)
y_tick_names = ['{:0.0f} K'.format(tick/1000) for tick in y_tick_values]

# Clustered bar chart: one cluster per fare type, one bar per trip type
sb.countplot(data = bikeshare, x = 'fare_type', hue = 'trip_type', saturation = 1)
plt.title('Distribution of rentals based on fare type and trip type\n\n', fontsize = 16, weight = 'bold')
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nFare Type', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)

legend_style = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha = 1,
                    borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5, ncol = 1,
                    title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
                    markerfirst=True, handlelength=2, handletextpad=0.5)
plt.legend(**legend_style)
sb.despine();

# bbox_inches='tight' grows the saved area so that no label is cut off.
# NOTE(review): the file name says "bike type and pass type" although this plot shows
# fare type and trip type; left unchanged in case the file is referenced elsewhere.
plt.savefig('plots/3.2.16.a Distribution of rentals based on bike type and pass type.png', dpi=300, bbox_inches='tight')
Facet Grid:
One general visualization technique that will be useful for you to know about to handle plots of two or more variables is
faceting. In faceting, the data is divided into disjoint subsets, most often by different levels of a categorical variable. For each of these subsets of the data, the same plot type is rendered on other variables. Faceting is a way of comparing distributions or relationships across levels of additional variables, especially when there are three or more variables of interest overall.
from matplotlib.ticker import FuncFormatter

sb.set_style('white')
sb.set_palette(palette = "GnBu_d", n_colors = 5, desat = None)

# Largest number of rentals of any (fare type, trip type) pair (not used by the plot below)
max_count = bikeshare.groupby([bikeshare['fare_type'],
                               bikeshare['trip_type']]).size().reset_index(name='count')['count'].max()

# Same bar order in every facet: without an explicit order each facet would
# position its bars from the values present in its own subset only.
fare_order = bikeshare['fare_type'].sort_values(ascending=True).unique()

# plot Facet Grid: one facet per trip type, one bar per fare type
g = sb.FacetGrid(data = bikeshare, col = 'trip_type', col_wrap = 2, height = 4, aspect = 1, hue = 'trip_type')
g.map(sb.countplot, 'fare_type', order = fare_order)
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Distribution of rentals based on fare type and trip type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'grey')

# Show the y ticks in thousands. A formatter labels every facet from its own
# tick values; a fixed list of label texts gathered across all facets holds
# more entries than a single axis has ticks, which matplotlib rejects.
thousands = FuncFormatter(lambda value, _pos: '{:0.0f} K'.format(value/1000))
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(thousands)
    ax.tick_params(axis='y', labelsize=11)

g.set_xticklabels(size = 11)
g.set_xlabels('\nFare Type', size = 13)
g.set_ylabels('Rentals (thousands)\n', size = 13)
# The colours encode the trip type (hue = 'trip_type'), so the legend is titled accordingly
g.add_legend(bbox_to_anchor=(0.9, 0.7), scatterpoints=1,frameon=True, fancybox=True,
             shadow=False, framealpha = 1, borderpad=1, borderaxespad=1, labelspacing=0.5,
             ncol = 1, title='Trip type', title_fontsize=14, fontsize=12, facecolor='white',
             markerfirst=True, handlelength=2, handletextpad=0.5);

# bbox_inches='tight' grows the saved area so that labels and the legend are not cut off.
# NOTE(review): the file name says "bike type and pass type" although this plot shows
# fare type and trip type; left unchanged in case the file is referenced elsewhere.
plt.savefig('plots/3.2.16.b Facet Grid of rentals based on bike type and pass type.png', dpi=300, bbox_inches='tight')
Heat Map:
One alternative way of depicting the relationship between two categorical variables is through a Heat map. Heat maps were introduced earlier as the 2-d version of a histogram; here, we're using them as the 2-d version of a bar chart. The seaborn function heatmap is at home with this type of heat map implementation, but the input arguments are unlike most of the visualization functions. Instead of providing the original dataframe, we need to summarize the counts into a matrix that will then be plotted.
The data first needs to be summarized by grouping the two variables together.
# Number of rentals for every (fare type, trip type) combination
group_keys = ['fare_type', 'trip_type']
categorical_counts = bikeshare.groupby(group_keys).size()
categorical_counts
Now, let's reset the index and give the summarized values a descriptive column name.
# Move the group keys from the index into ordinary columns; the sizes go into a 'count' column
categorical_counts = categorical_counts.rename('count').reset_index()
categorical_counts
Now let's pivot the summarized counts into a matrix, which is the data structure the heat map expects.
# Reshape the long table into a matrix: one row per trip type, one column per
# fare type, each cell holding the corresponding 'count' value
categorical_counts = categorical_counts.pivot(index = 'trip_type', columns = 'fare_type', values = 'count')
categorical_counts
The data is ready to plot the
Heat map.
# Heat map of the count matrix; every cell is first annotated with its count, printed without decimals
plt.figure(figsize = [4, 4])
heatmap_style = dict(annot = True, fmt = '.0f', annot_kws={'size':13}, linewidths=0.1, cmap="YlGnBu")
res = sb.heatmap(categorical_counts, **heatmap_style)
plt.title('Rentals based on fare type and trip type\n', fontsize = 16, weight = 'bold')
plt.xlabel('\nFare Type', fontsize=14)
plt.ylabel('Trip Type\n', fontsize=14)
plt.yticks(rotation=0, fontsize=12, va="center")
plt.xticks(fontsize=12);

# Replace each annotated count by its share of all rentals, in percent with one decimal;
# shares that round to less than 0.1 % are shown as '< 0.1 %'
n_rentals = bikeshare.shape[0]
for annotation in res.texts:
    count = int(annotation.get_text())
    percent = np.round(np.round(count/n_rentals, 4)*100, 1)
    if not percent < 0.1:
        annotation.set_text('{} %'.format(percent))
    else:
        annotation.set_text('< 0.1 %')

# bbox_inches='tight' grows the saved area so that no label is cut off.
# NOTE(review): the file name says "pass type and bike type" although this plot shows
# fare type and trip type; left unchanged in case the file is referenced elsewhere.
plt.savefig('plots/3.2.16.c Distribution of rentals based on pass type and bike type.png', dpi=300, bbox_inches='tight')
fare_type and pass_type columns:
Columns: fare_type, pass_type
Data type: Categorical, nominal
Plot: Clustered Bar chart, Heatmap
Clustered Bar chart:
To depict the relationship between two categorical variables, we can extend the univariate bar chart into a
clustered bar chart. In a clustered bar chart, bars are organized into clusters based on the levels of the first variable, and within each cluster the bars are ordered consistently by the levels of the second variable.
# Plot styling: 'GnBu_d' palette on a white background
sb.set_palette(palette = "GnBu_d", n_colors = 5, desat = None)
current_palette = sb.color_palette()
sb.set_style('white')
plt.figure(figsize = [6, 4])

# Largest number of rentals of any (fare type, pass type) pair: sets how far the y ticks must reach
pair_sizes = bikeshare.groupby([bikeshare['fare_type'], bikeshare['pass_type']]).size()
max_count = pair_sizes.reset_index(name='count')['count'].max()

# A tick every 100,000 rentals, labelled in thousands
y_tick_values = np.arange(0, max_count + 100000, 100000)
y_tick_names = ['{:0.0f} K'.format(tick/1000) for tick in y_tick_values]

# Clustered bar chart: one cluster per fare type, one bar per pass type
sb.countplot(data = bikeshare, x = 'fare_type', hue = 'pass_type', saturation = 1)
plt.title('Distribution of rentals based on fare type and pass type\n\n', fontsize = 16, weight = 'bold')
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nFare Type', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)

legend_style = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha = 1,
                    borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5, ncol = 1,
                    title='Pass type', title_fontsize=12, fontsize=10, facecolor='white',
                    markerfirst=True, handlelength=2, handletextpad=0.5)
plt.legend(**legend_style)
sb.despine();

# bbox_inches='tight' grows the saved area so that no label is cut off.
# NOTE(review): the file name says "bike type and pass type" although this plot shows
# fare type and pass type; left unchanged in case the file is referenced elsewhere.
plt.savefig('plots/3.2.17.a Distribution of rentals based on bike type and pass type.png', dpi=300, bbox_inches='tight')
Facet Grid:
One general visualization technique that will be useful for you to know about to handle plots of two or more variables is
faceting. In faceting, the data is divided into disjoint subsets, most often by different levels of a categorical variable. For each of these subsets of the data, the same plot type is rendered on other variables. Faceting is a way of comparing distributions or relationships across levels of additional variables, especially when there are three or more variables of interest overall.
sb.set_style('white')
sb.set_palette(palette="GnBu_d", n_colors=5, desat=None)
# Size of the largest (fare_type, pass_type) group.
max_count = bikeshare.groupby(['fare_type', 'pass_type']).size().max()

# plot Facet Grid: one count plot of fare types per pass type
g = sb.FacetGrid(data=bikeshare, col='pass_type', col_wrap=3, height=3, aspect=1, hue='pass_type')
g.map(sb.countplot, 'fare_type')
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Distribution of rentals based on fare type and pass type\n', fontsize=16, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='grey')

# Relabel the y ticks in thousands. The labels are built per facet from the
# numeric tick positions: parsing the rendered label text with int() fails
# when the text is empty or not an integer, and one list accumulated over all
# facets does not match the number of ticks on any single facet.
for ax in g.axes.flat:
    y_low, y_high = ax.get_ylim()
    # Keep only the ticks inside the current view so pinning them does not
    # widen the axis limits.
    y_tick_locs = [loc for loc in ax.get_yticks() if y_low <= loc <= y_high]
    ax.set_yticks(y_tick_locs)
    ax.set_yticklabels(['{:0.0f} K'.format(loc / 1000) for loc in y_tick_locs], size=11)

g.set_xticklabels(size=11)
g.set_xlabels('\nFare Type', size=13)
g.set_ylabels('Rentals (thousands)\n', size=13)
g.add_legend(bbox_to_anchor=(0.9, 0.2), scatterpoints=1, frameon=True, fancybox=True,
             shadow=False, framealpha=1, borderpad=1, borderaxespad=1, labelspacing=0.5,
             ncol=1, title='Pass type', title_fontsize=14, fontsize=12, facecolor='white',
             markerfirst=True, handlelength=2, handletextpad=0.5)

# bbox_inches='tight' makes the saved figure include all of the x and y labels.
plt.savefig('plots/3.2.17.b Facet Grid of rentals based on bike type and pass type.png', dpi=300, bbox_inches='tight')
Heat Map:
One alternative way of depicting the relationship between two categorical variables is through a Heat map. Heat maps were introduced earlier as the 2-d version of a histogram; here, we're using them as the 2-d version of a bar chart. The seaborn function heatmap is at home with this type of heat map implementation, but the input arguments are unlike most of the visualization functions. Instead of providing the original dataframe, we need to summarize the counts into a matrix that will then be plotted.
The data needs some summarization by grouping together the respective variables.
# Count the rentals in every (fare_type, pass_type) combination.
grouping_columns = ['fare_type', 'pass_type']
categorical_counts = bikeshare.groupby(grouping_columns).size()
categorical_counts
Now, let's reset the index and name the summarized data values accordingly.
# Turn the grouped Series into a flat table whose value column is called 'count'.
categorical_counts = categorical_counts.rename('count').reset_index()
categorical_counts
Now let's pivot the summarized data into a more appropriate data structure.
# Reshape to a matrix: one row per pass type, one column per fare type.
pivot_spec = {'index': 'pass_type', 'columns': 'fare_type', 'values': 'count'}
categorical_counts = categorical_counts.pivot(**pivot_spec)
categorical_counts
The data is ready to plot the heat map.
# Draw the heat map of raw counts; the annotations are rewritten as percentages below.
plt.figure(figsize=[4, 6])
res = sb.heatmap(
    categorical_counts,
    annot=True,
    fmt='.0f',
    annot_kws={'size': 13},
    linewidths=0.1,
    cmap="YlGnBu",
)
plt.title('Rentals based on fare type and pass type\n', fontsize=16, weight='bold')
plt.xlabel('\nFare Type', fontsize=14)
plt.ylabel('Pass Type\n', fontsize=14)
plt.yticks(rotation=0, fontsize=12, va="center")
plt.xticks(fontsize=12)

# Replace every count annotation with its share of all rentals (in percent).
total_rentals = bikeshare.shape[0]
for annotation in res.texts:
    share = np.round(np.round(int(annotation.get_text()) / total_rentals, 4) * 100, 1)
    annotation.set_text('< 0.1 %' if share < 0.1 else '{} %'.format(share))

# bbox_inches='tight' makes the saved figure include all of the x and y labels.
plt.savefig('plots/3.2.17.c Distribution of rentals based on pass type and bike type.png', dpi=300, bbox_inches='tight')
fare_type and bike_type columns:
Columns: fare_type, bike_type
Data type: Categorical, nominal
Plot: Clustered Bar chart, Heatmap
Clustered Bar chart:
To depict the relationship between two categorical variables, we can extend the univariate bar chart into a clustered bar chart. In a clustered bar chart, bars are organized into clusters based on levels of the first variable, and then bars are ordered consistently across the second variable within each cluster.
# Palette and style used by the clustered bar chart.
sb.set_palette(palette="GnBu_d", n_colors=5, desat=None)
current_palette = sb.color_palette()
sb.set_style('white')
plt.figure(figsize=[6, 4])

# The largest (fare_type, bike_type) group determines how far the y ticks must reach.
max_count = bikeshare.groupby(['fare_type', 'bike_type']).size().max()
y_tick_values = np.arange(0, max_count + 100000, 100000)
y_tick_names = ['{:0.0f} K'.format(tick / 1000) for tick in y_tick_values]

# One cluster of bars per fare type, one bar per bike type inside each cluster.
sb.countplot(data=bikeshare, x='fare_type', hue='bike_type', saturation=1)
plt.title('Distribution of rentals based on fare type and bike type\n\n', fontsize=16, weight='bold')
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nFare Type', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)

legend_options = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha=1,
    borderpad=1, borderaxespad=1, loc='upper right', labelspacing=0.5, ncol=1,
    title='Bike type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5,
)
plt.legend(**legend_options)
sb.despine()

# bbox_inches='tight' makes the saved figure include all of the x and y labels.
plt.savefig('plots/3.2.18.a Distribution of rentals based on bike type and pass type.png', dpi=300, bbox_inches='tight')
Facet Grid:
One general visualization technique that will be useful for you to know about to handle plots of two or more variables is
faceting. In faceting, the data is divided into disjoint subsets, most often by different levels of a categorical variable. For each of these subsets of the data, the same plot type is rendered on other variables. Faceting is a way of comparing distributions or relationships across levels of additional variables, especially when there are three or more variables of interest overall.
sb.set_style('white')
sb.set_palette(palette="GnBu_d", n_colors=5, desat=None)
# Size of the largest (fare_type, bike_type) group.
max_count = bikeshare.groupby(['fare_type', 'bike_type']).size().max()

# plot Facet Grid: one count plot of fare types per bike type
g = sb.FacetGrid(data=bikeshare, col='bike_type', col_wrap=2, height=4, aspect=1, hue='bike_type')
g.map(sb.countplot, 'fare_type')
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Distribution of rentals based on fare type and bike type\n', fontsize=16, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='grey')

# Relabel the y ticks in thousands. The labels are built per facet from the
# numeric tick positions: parsing the rendered label text with int() fails
# when the text is empty or not an integer, and one list accumulated over all
# facets does not match the number of ticks on any single facet.
for ax in g.axes.flat:
    y_low, y_high = ax.get_ylim()
    # Keep only the ticks inside the current view so pinning them does not
    # widen the axis limits.
    y_tick_locs = [loc for loc in ax.get_yticks() if y_low <= loc <= y_high]
    ax.set_yticks(y_tick_locs)
    ax.set_yticklabels(['{:0.0f} K'.format(loc / 1000) for loc in y_tick_locs], size=11)

g.set_xticklabels(size=11)
g.set_xlabels('\nFare Type', size=13)
g.set_ylabels('Rentals (thousands)\n', size=13)
g.add_legend(bbox_to_anchor=(1.1, 0.6), scatterpoints=1, frameon=True, fancybox=True,
             shadow=False, framealpha=1, borderpad=1, borderaxespad=1, labelspacing=0.5,
             ncol=1, title='Bike type', title_fontsize=14, fontsize=12, facecolor='white',
             markerfirst=True, handlelength=2, handletextpad=0.5)

# bbox_inches='tight' makes the saved figure include all of the x and y labels.
plt.savefig('plots/3.2.18.b Facet Grid of rentals based on bike type and pass type.png', dpi=300, bbox_inches='tight')
Heat Map:
One alternative way of depicting the relationship between two categorical variables is through a Heat map. Heat maps were introduced earlier as the 2-d version of a histogram; here, we're using them as the 2-d version of a bar chart. The seaborn function heatmap is at home with this type of heat map implementation, but the input arguments are unlike most of the visualization functions. Instead of providing the original dataframe, we need to summarize the counts into a matrix that will then be plotted.
The data needs some summarization by grouping together the respective variables.
# Count the rentals in every (fare_type, bike_type) combination.
grouping_columns = ['fare_type', 'bike_type']
categorical_counts = bikeshare.groupby(grouping_columns).size()
categorical_counts
Now, let's reset the index and name the summarized data values accordingly.
# Turn the grouped Series into a flat table whose value column is called 'count'.
categorical_counts = categorical_counts.rename('count').reset_index()
categorical_counts
Now let's pivot the summarized data into a more appropriate data structure.
# Reshape to a matrix: one row per bike type, one column per fare type.
pivot_spec = {'index': 'bike_type', 'columns': 'fare_type', 'values': 'count'}
categorical_counts = categorical_counts.pivot(**pivot_spec)
categorical_counts
The data is ready to plot the heat map.
# Draw the heat map of raw counts; the annotations are rewritten as percentages below.
plt.figure(figsize=[4, 6])
res = sb.heatmap(
    categorical_counts,
    annot=True,
    fmt='.0f',
    annot_kws={'size': 13},
    linewidths=0.1,
    cmap="YlGnBu",
)
plt.title('Rentals based on fare type and bike type\n', fontsize=16, weight='bold')
plt.xlabel('\nFare Type', fontsize=14)
plt.ylabel('Bike Type\n', fontsize=14)
plt.yticks(rotation=0, fontsize=12, va="center")
plt.xticks(fontsize=12)

# Replace every count annotation with its share of all rentals (in percent).
total_rentals = bikeshare.shape[0]
for annotation in res.texts:
    share = np.round(np.round(int(annotation.get_text()) / total_rentals, 4) * 100, 1)
    annotation.set_text('< 0.1 %' if share < 0.1 else '{} %'.format(share))

# bbox_inches='tight' makes the saved figure include all of the x and y labels.
plt.savefig('plots/3.2.18.c Distribution of rentals based on pass type and bike type.png', dpi=300, bbox_inches='tight')
duration_min and trip_type columns:
Columns: duration_min, trip_type
Data type: (Numerical, continuous) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Bar chart
sb.set_style('white')
sb.set_palette(palette="colorblind", n_colors=10, desat=0.8)
base_color = sb.color_palette()[9]

# One histogram of trip durations (100-minute bins) per trip type.
g = sb.FacetGrid(data=bikeshare, col='trip_type', col_wrap=2, height=4, aspect=1)
g.map(plt.hist, "duration_min", color=base_color,
      bins=np.arange(0, bikeshare.duration_min.max() + 100, 100))
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Distribution of rental durations based on trip type\n', fontsize=16, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Express the y ticks in millions and resize the x tick labels. The labels are
# built per facet from the numeric tick positions: parsing the rendered label
# text with int() fails when the text is empty or not an integer, and one list
# accumulated over all facets does not match the number of ticks on any single
# facet.
for ax in g.axes.flat:
    y_low, y_high = ax.get_ylim()
    # Keep only the ticks inside the current view so pinning them does not
    # widen the axis limits.
    y_tick_locs = [loc for loc in ax.get_yticks() if y_low <= loc <= y_high]
    ax.set_yticks(y_tick_locs)
    ax.set_yticklabels(['{:0.2f}'.format(loc / 1000000) for loc in y_tick_locs], size=12)
    # The x tick text stays automatic; only its font size is changed.
    ax.tick_params(axis='x', labelsize=12)

g.set_xlabels('\nDuration (minutes)', size=14)
g.set_ylabels('Rentals (millions)\n', size=14)
plt.subplots_adjust(wspace=0.2, hspace=0.3)

# bbox_inches='tight' makes the saved figure include all of the x and y labels.
plt.savefig('plots/3.2.19.a Facet Grid of rental durations on trip type.png', dpi=300, bbox_inches='tight')
Let us plot more variety of graphs to observe the distribution of data and hidden insights.
def _decorate_duration_panel(panel_title, y_axis_label):
    """Apply the shared title, axis labels and tick font sizes to the current subplot."""
    plt.title(panel_title, weight='bold', fontsize=16)
    plt.xlabel('\nTrip type', fontsize=14)
    plt.ylabel(y_axis_label, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)


plt.figure(figsize=[14, 4])
sb.set_style('darkgrid')
sb.set_palette(palette="colorblind", n_colors=10, desat=0.6)
base_color = sb.color_palette()[9]

# left panel: violin plot with quartile lines
plt.subplot(1, 3, 1)
sb.violinplot(data=bikeshare, x='trip_type', y='duration_min', inner='quartile', color=base_color)
_decorate_duration_panel('Trip durations - Violin plot\n', 'Duration (minutes)\n')

# center panel: box plot
plt.subplot(1, 3, 2)
sb.boxplot(data=bikeshare, x='trip_type', y='duration_min', color=base_color)
_decorate_duration_panel('Trip durations - Box plot\n', '')

# right panel: strip chart
plt.subplot(1, 3, 3)
sb.stripplot(x="trip_type", y="duration_min", data=bikeshare, alpha=0.5, color=base_color)
_decorate_duration_panel('Trip durations - Strip plot\n', '')

plt.subplots_adjust(wspace=0.3, hspace=0.3)

# bbox_inches='tight' makes the saved figure include all of the x and y labels.
plt.savefig('plots/3.2.19.b Distribution of trip durations on various plots.png', dpi=300, bbox_inches='tight')
The above plot shows the presence of a long tail of outliers, which requires closer observation for a better understanding of the data distribution.
Calculate the average trip duration and the most frequent trip duration subjected to each trip type.
# Trip durations of each trip type, selected once and reused below.
oneway_durations = bikeshare.query(' trip_type == "One Way" ').duration_min
roundtrip_durations = bikeshare.query(' trip_type == "Round Trip" ').duration_min

# Mean (rounded up to a whole minute) and most frequent duration per trip type.
oneway_mean = math.ceil(oneway_durations.mean())
oneway_mode = oneway_durations.mode()[0]
roundtrip_mean = math.ceil(roundtrip_durations.mean())
roundtrip_mode = roundtrip_durations.mode()[0]

print('Duration mean'.center(30, '-'))
print('oneway_mean : ', oneway_mean, 'minutes')
print('roundtrip_mean : ', roundtrip_mean, 'minutes')
print('\n')
print('Duration mode'.center(30, '-'))
print('oneway_mode : ', oneway_mode, 'minutes')
print('roundtrip_mode : ', roundtrip_mode, 'minutes')
plt.figure(figsize=[12, 5])
sb.set_palette(palette="colorblind", n_colors=10, desat=0.8)
base_color = sb.color_palette()[9]

# left plot: point plot - Avg trip duration
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.pointplot(data=bikeshare, x="trip_type", y="duration_min", linestyles="-", color=base_color)
plt.title('Avg. Trip duration - Point plot\n\n', weight='bold', fontsize=16, color='dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize=14)
plt.xlabel('\nTrip type', fontsize=14)
plt.xticks(fontsize=12)
# Rebuild the y ticks in steps of 10 minutes, starting from zero.
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs))) + 10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True)

# add annotations
# -------------------------------------------------------
cat_order = bikeshare.trip_type.sort_values(ascending=True).unique()
# Select the column before aggregating: a frame-wide mean() also tries to
# average the non-numeric columns, which raises in pandas 2.x.
avg_rental_counts = bikeshare.groupby('trip_type')['duration_min'].mean()[cat_order]
avg_rental_max = avg_rental_counts.max()
# Values above 80 % of the maximum get a gold label, the others a green one.
clrs = ['gold' if (count > ((avg_rental_max * 4) / 5)) else 'limegreen' for count in avg_rental_counts]
# get the current tick locations
locs, labels = plt.xticks()
# write the rounded-up mean slightly above each point
for loc, avg_rental_count, clr in zip(locs, avg_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(avg_rental_count))
    plt.text(loc - 0.1, avg_rental_count + int(avg_rental_max / 10), pct_string, ha='center', color='black',
             fontsize=12, bbox=dict(pad=1.9, alpha=0.2, color='none', fc=clr))
# -------------------------------------------------------

# right plot: point plot - most frequent trip duration
sb.set_style('white')
plt.subplot(1, 2, 2)
oneway_mode = bikeshare.query(' trip_type == "One Way" ').duration_min.mode()[0]
roundtrip_mode = bikeshare.query(' trip_type == "Round Trip" ').duration_min.mode()[0]
heights = [oneway_mode, roundtrip_mode]
labels = bikeshare.trip_type.sort_values(ascending=True).unique()
sb.pointplot(x=labels, y=heights, color=base_color)
plt.title('Most frequent trip duration\n\n', weight='bold', fontsize=16, color='dimgrey')
plt.xlabel('\nTrip type', fontsize=14)
plt.ylabel('Duration (minutes)\n', fontsize=14)
plt.xticks(fontsize=12)
# Reuse the left plot's y range so both panels are directly comparable.
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs))) + 10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine()

# add annotations
# -------------------------------------------------------
freq_rental_counts = heights
freq_rental_max = max(freq_rental_counts)
clrs = ['gold' if (count > ((freq_rental_max * 4) / 5)) else 'limegreen' for count in freq_rental_counts]
# get the current tick locations
locs, labels = plt.xticks()
# write the mode above each point
for loc, freq_rental_count, clr in zip(locs, freq_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(freq_rental_count))
    plt.text(loc, freq_rental_count + (freq_rental_max), pct_string, ha='center', color='black',
             fontsize=12, bbox=dict(pad=1.9, alpha=0.2, color='none', fc=clr))
# -------------------------------------------------------

plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of all rental durations based on trip type\n', fontsize=16, weight='bold')

# bbox_inches='tight' makes the saved figure include all of the x and y labels.
plt.savefig('plots/3.2.19.c Average trip durations based on trip type.png', dpi=300, bbox_inches='tight')
If the trip type is One Way, then the bike rentals have an average rental duration of 24 minutes and a mode of 5 minutes.
If the trip type is Round Trip, then the bike rentals have an average rental duration of 64 minutes and a mode of 1 minute.
The mode of 1 minute for round trips is probably due to bicycles being returned immediately after rental because of a technical or other issue. Hence, exclude the round trips of 1 minute or less and re-evaluate the mode.
# Row labels of the round trips that lasted one minute or less.
drop_index = bikeshare.query(' trip_type == "Round Trip" and duration_min <= 1 ').index
# Copy of the dataset without those rows.
temp_df = bikeshare.drop(index=drop_index)
plt.figure(figsize=[12, 5])
sb.set_palette(palette="colorblind", n_colors=10, desat=0.8)
base_color = sb.color_palette()[9]

# left plot: point plot - Avg trip duration
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.pointplot(data=temp_df, x="trip_type", y="duration_min", linestyles="-", color=base_color)
plt.title('Avg. Trip duration - Point plot\n\n', weight='bold', fontsize=16, color='dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize=14)
plt.xlabel('\nTrip type', fontsize=14)
plt.xticks(fontsize=12)
# Rebuild the y ticks in steps of 10 minutes, starting from zero.
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs))) + 10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True)

# add annotations
# -------------------------------------------------------
cat_order = temp_df.trip_type.sort_values(ascending=True).unique()
# Select the column before aggregating: a frame-wide mean() also tries to
# average the non-numeric columns, which raises in pandas 2.x.
avg_rental_counts = temp_df.groupby('trip_type')['duration_min'].mean()[cat_order]
avg_rental_max = avg_rental_counts.max()
# Values above 80 % of the maximum get a gold label, the others a green one.
clrs = ['gold' if (count > ((avg_rental_max * 4) / 5)) else 'limegreen' for count in avg_rental_counts]
# get the current tick locations
locs, labels = plt.xticks()
# write the rounded-up mean slightly above each point
for loc, avg_rental_count, clr in zip(locs, avg_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(avg_rental_count))
    plt.text(loc - 0.1, avg_rental_count + int(avg_rental_max / 10), pct_string, ha='center', color='black',
             fontsize=12, bbox=dict(pad=1.9, alpha=0.2, color='none', fc=clr))
# -------------------------------------------------------

# right plot: point plot - most frequent trip duration
sb.set_style('white')
plt.subplot(1, 2, 2)
oneway_mode = temp_df.query(' trip_type == "One Way" ').duration_min.mode()[0]
roundtrip_mode = temp_df.query(' trip_type == "Round Trip" ').duration_min.mode()[0]
heights = [oneway_mode, roundtrip_mode]
labels = temp_df.trip_type.sort_values(ascending=True).unique()
sb.pointplot(x=labels, y=heights, color=base_color)
plt.title('Most frequent trip duration\n\n', weight='bold', fontsize=16, color='dimgrey')
plt.xlabel('\nTrip type', fontsize=14)
plt.ylabel('Duration (minutes)\n', fontsize=14)
plt.xticks(fontsize=12)
# Reuse the left plot's y range so both panels are directly comparable.
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs))) + 10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine()

# add annotations
# -------------------------------------------------------
freq_rental_counts = heights
freq_rental_max = max(freq_rental_counts)
clrs = ['gold' if (count > ((freq_rental_max * 4) / 5)) else 'limegreen' for count in freq_rental_counts]
# get the current tick locations
locs, labels = plt.xticks()
# write the mode above each point
for loc, freq_rental_count, clr in zip(locs, freq_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(freq_rental_count))
    plt.text(loc - 0.1, freq_rental_count + (freq_rental_max / 5), pct_string, ha='center', color='black',
             fontsize=12, bbox=dict(pad=1.9, alpha=0.2, color='none', fc=clr))
# -------------------------------------------------------

plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of all rental durations based on trip type\n', fontsize=16, weight='bold')

# bbox_inches='tight' makes the saved figure include all of the x and y labels.
plt.savefig('plots/3.2.19.d Average trip durations based on trip type.png', dpi=300, bbox_inches='tight')
If the trip type is One Way, then the bike rentals have an average rental duration of 24 minutes and a mode of 5 minutes.
If the trip type is Round Trip (excluding 1 minute trips), then the average rental duration increases to 71 minutes and the mode increases to 28 minutes.
Dataset limited under 120 min:
# Summary statistics of the trip durations (in minutes).
bikeshare['duration_min'].describe()
The distribution of duration_min values is vague and not easy to interpret. This can be overcome by limiting the plot to a threshold value, say 120 minutes or 2 hours, as most of the duration_min values fall under this boundary.
# Percentage of all trips that last at most 2 hours (120 minutes).
trips_within_two_hours = len(bikeshare.query(' duration_min <= 120 '))
np.round((trips_within_two_hours / len(bikeshare)) * 100, 2)
# Trips longer than 2 hours: these are the rows the 120-minute limit removes.
duration_120 = bikeshare.query(' duration_min > 120 ')

# Percentage of the 'One Way' trips that will be dropped.
long_oneway_trips = len(duration_120.query(' trip_type == "One Way" '))
all_oneway_trips = len(bikeshare.query(' trip_type == "One Way" '))
oneway_drops = np.round((long_oneway_trips / all_oneway_trips) * 100, 2)

# Percentage of the 'Round Trip' trips that will be dropped.
long_round_trips = len(duration_120.query(' trip_type == "Round Trip" '))
all_round_trips = len(bikeshare.query(' trip_type == "Round Trip" '))
round_drops = np.round((long_round_trips / all_round_trips) * 100, 2)

print("Proportion of oneway trips that will be dropped".ljust(50, ' '), ':', oneway_drops)
print("Proportion of round trips that will be dropped".ljust(50, ' '), ':', round_drops)
# Keep only the trips that last at most 2 hours (120 minutes).
two_hour_filter = ' duration_min <= 120 '
duration_lim_120 = bikeshare.query(two_hour_filter)
sb.set_style('white')
sb.set_palette(palette="colorblind", n_colors=10, desat=0.8)
base_color = sb.color_palette()[9]

# Histogram of the trip durations in 5-minute bins.
bin_edges = np.arange(0, duration_lim_120.duration_min.max() + 10, 5)
plt.hist(duration_lim_120['duration_min'], color=base_color, bins=bin_edges)
plt.title('Distribution of trip durations under 2 hours\n', weight='bold', fontsize=16)
plt.xlabel('\nDuration (minutes)', fontsize=14)
plt.ylabel('Rentals (thousands)\n', fontsize=14)
plt.xticks(fontsize=12)

# Place a y tick every 25,000 rentals and label it in thousands.
locs, labels = plt.yticks()
highest_tick = int(math.ceil(max(locs)))
y_tick_locs = np.arange(0, highest_tick + 1000, 25000)
y_tick_names = ['{:0.0f} K'.format(tick / 1000) for tick in y_tick_locs]
plt.yticks(y_tick_locs, y_tick_names, fontsize=12)
sb.despine()

# bbox_inches='tight' makes the saved figure include all of the x and y labels.
plt.savefig('plots/3.2.19.e Distribution of rental durations under 2 hours on trip type.png', dpi=300, bbox_inches='tight')
sb.set_style('white')
sb.set_palette(palette="colorblind", n_colors=10, desat=0.8)
base_color = sb.color_palette()[9]

# One histogram of trip durations (5-minute bins) per trip type.
g = sb.FacetGrid(data=duration_lim_120, col='trip_type', col_wrap=2, height=4, aspect=1, xlim=(0, 120))
g.set(xmargin=0.5, ymargin=0.5)
g.map(plt.hist, "duration_min", color=base_color,
      bins=np.arange(0, duration_lim_120.duration_min.max() + 10, 5))
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Distribution of rental durations under 120 min based on trip type\n', fontsize=16, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Express the y ticks in thousands and resize the x tick labels. The labels
# are built per facet from the numeric tick positions: parsing the rendered
# label text with int() fails when the text is empty or not an integer, and
# one list accumulated over all facets does not match the number of ticks on
# any single facet.
for ax in g.axes.flat:
    y_low, y_high = ax.get_ylim()
    # Keep only the ticks inside the current view so pinning them does not
    # widen the axis limits.
    y_tick_locs = [loc for loc in ax.get_yticks() if y_low <= loc <= y_high]
    ax.set_yticks(y_tick_locs)
    ax.set_yticklabels(['{:0.0f} K'.format(loc / 1000) for loc in y_tick_locs], size=12)
    # The x tick text stays automatic; only its font size is changed.
    ax.tick_params(axis='x', labelsize=12)

g.set_xlabels('\nDuration (minutes)', size=14)
g.set_ylabels('Rentals (thousands)\n', size=14)
plt.subplots_adjust(wspace=0.2, hspace=0.3)

# bbox_inches='tight' makes the saved figure include all of the x and y labels.
plt.savefig('plots/3.2.19.f Facet Grid of rental durations under 2 hours on trip type.png', dpi=300, bbox_inches='tight')
def _decorate_limited_duration_panel(panel_title, y_axis_label):
    """Apply the shared title, axis labels and tick font sizes to the current subplot."""
    plt.title(panel_title, weight='bold', fontsize=16, color='dimgrey')
    plt.xlabel('\nTrip type', fontsize=14)
    plt.ylabel(y_axis_label, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)


plt.figure(figsize=[14, 5])
sb.set_style('darkgrid')
base_color = sb.color_palette()[9]

# left panel: violin plot with quartile lines
plt.subplot(1, 3, 1)
sb.violinplot(data=duration_lim_120, x='trip_type', y='duration_min',
              inner='quartile', color=base_color)
_decorate_limited_duration_panel('Trip durations - Violin plot\n', 'Duration (minutes)\n')

# center panel: box plot
plt.subplot(1, 3, 2)
sb.boxplot(data=duration_lim_120, x='trip_type', y='duration_min', color=base_color)
_decorate_limited_duration_panel('Trip durations - Box plot\n', '')

# right panel: strip chart, drawn almost fully transparent
plt.subplot(1, 3, 3)
sb.stripplot(data=duration_lim_120, x="trip_type", y="duration_min", alpha=0.002, color=base_color)
_decorate_limited_duration_panel('Trip durations - Strip plot\n', '')

plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.75)
plt.suptitle('Distribution of rental durations under 120 min based on trip type\n', fontsize=16, weight='bold')

# bbox_inches='tight' makes the saved figure include all of the x and y labels.
plt.savefig('plots/3.2.19.g Distribution of trip durations under 2 hours on various plots.png', dpi=300, bbox_inches='tight')
Calculate the average trip duration and the most frequent trip duration subjected to each trip type.
# Trip durations of each trip type within the 120-minute subset, selected once.
oneway_durations = duration_lim_120.query(' trip_type == "One Way" ').duration_min
roundtrip_durations = duration_lim_120.query(' trip_type == "Round Trip" ').duration_min

# Mean (rounded up to a whole minute) and most frequent duration per trip type.
oneway_mean = math.ceil(oneway_durations.mean())
oneway_mode = oneway_durations.mode()[0]
roundtrip_mean = math.ceil(roundtrip_durations.mean())
roundtrip_mode = roundtrip_durations.mode()[0]

print('Duration mean'.center(30, '-'))
print('oneway_mean : ', oneway_mean, 'minutes')
print('roundtrip_mean : ', roundtrip_mean, 'minutes')
print('\n')
print('Duration mode'.center(30, '-'))
print('oneway_mode : ', oneway_mode, 'minutes')
print('roundtrip_mode : ', roundtrip_mode, 'minutes')
plt.figure(figsize=[12, 5])
sb.set_palette(palette="colorblind", n_colors=10, desat=0.8)
base_color = sb.color_palette()[9]

# left plot: point plot - Avg trip duration
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.pointplot(data=duration_lim_120, x="trip_type", y="duration_min", linestyles="-", color=base_color)
plt.title('Avg. Trip duration - Point plot\n\n', weight='bold', fontsize=16, color='dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize=14)
plt.xlabel('\nTrip type', fontsize=14)
plt.xticks(fontsize=12)
# Rebuild the y ticks in steps of 10 minutes, starting from zero.
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs))) + 10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True)

# add annotations
# -------------------------------------------------------
cat_order = duration_lim_120.trip_type.sort_values(ascending=True).unique()
# Select the column before aggregating: a frame-wide mean() also tries to
# average the non-numeric columns, which raises in pandas 2.x.
avg_rental_counts = duration_lim_120.groupby('trip_type')['duration_min'].mean()[cat_order]
avg_rental_max = avg_rental_counts.max()
# Values above 80 % of the maximum get a gold label, the others a green one.
clrs = ['gold' if (count > ((avg_rental_max * 4) / 5)) else 'limegreen' for count in avg_rental_counts]
# get the current tick locations
locs, labels = plt.xticks()
# write the rounded-up mean slightly above each point
for loc, avg_rental_count, clr in zip(locs, avg_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(avg_rental_count))
    plt.text(loc - 0.1, avg_rental_count + int(avg_rental_max / 10), pct_string, ha='center', color='black',
             fontsize=12, bbox=dict(pad=1.9, alpha=0.2, color='none', fc=clr))
# -------------------------------------------------------

# right plot: point plot - most frequent trip duration
sb.set_style('white')
plt.subplot(1, 2, 2)
oneway_mode = duration_lim_120.query(' trip_type == "One Way" ').duration_min.mode()[0]
roundtrip_mode = duration_lim_120.query(' trip_type == "Round Trip" ').duration_min.mode()[0]
heights = [oneway_mode, roundtrip_mode]
labels = duration_lim_120.trip_type.sort_values(ascending=True).unique()
sb.pointplot(x=labels, y=heights, color=base_color)
plt.title('Most frequent trip duration\n\n', weight='bold', fontsize=16, color='dimgrey')
plt.xlabel('\nTrip type', fontsize=14)
plt.ylabel('Duration (minutes)\n', fontsize=14)
plt.xticks(fontsize=12)
# Reuse the left plot's y range so both panels are directly comparable.
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs))) + 10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine()

# add annotations
# -------------------------------------------------------
freq_rental_counts = heights
freq_rental_max = max(freq_rental_counts)
clrs = ['gold' if (count > ((freq_rental_max * 4) / 5)) else 'limegreen' for count in freq_rental_counts]
# get the current tick locations
locs, labels = plt.xticks()
# write the mode above each point
for loc, freq_rental_count, clr in zip(locs, freq_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(freq_rental_count))
    plt.text(loc, freq_rental_count + (freq_rental_max / 2), pct_string, ha='center', color='black',
             fontsize=12, bbox=dict(pad=1.9, alpha=0.2, color='none', fc=clr))
# -------------------------------------------------------

plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of rental durations under 120 min based on trip type\n', fontsize=16, weight='bold')

# bbox_inches='tight' makes the saved figure include all of the x and y labels.
plt.savefig('plots/3.2.19.h Average trip durations under 2 hours based on trip type.png', dpi=300, bbox_inches='tight')
If the trip type is One Way, and the dataset is limited to trips under 120 minutes, then the bike rentals have an average rental duration of 16 minutes and a mode of 5 minutes.
If the trip type is Round Trip, and the dataset is limited to trips under 120 minutes, then the bike rentals have an average rental duration of 35 minutes and a mode of 1 minute.
The 1-minute mode for round trips is probably caused by bicycles being returned immediately after rental because of a technical or other issue. Hence, exclude the round trips of 1 minute or less and re-evaluate the mode.
# Re-assess trips under 120 minutes after discarding round trips of one minute
# or less (presumably immediate returns), then redraw the average and the most
# frequent trip duration per trip type side by side.
drop_index = duration_lim_120.query(' trip_type == "Round Trip" and duration_min <= 1 ').index
temp_df = duration_lim_120.drop(drop_index)

plt.figure(figsize = [12, 5])
sb.set_palette(palette = "colorblind", n_colors = 10, desat = 0.8)
base_color = sb.color_palette()[9]

# left plot: point plot - average trip duration per trip type
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.pointplot(data = temp_df, x = "trip_type", y = "duration_min", linestyles = "-", color = base_color)
plt.title('Avg. Trip duration - Point plot\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize = 14)
plt.xlabel('\nTrip type', fontsize = 14)
plt.xticks(fontsize = 12)
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+5, 5)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True)

# add annotations
# -------------------------------------------------------
cat_order = temp_df.trip_type.sort_values(ascending=True).unique()
# Average only the duration column; a frame-wide mean() is wasted work and
# fails on non-numeric columns in recent pandas releases.
avg_rental_counts = temp_df.groupby("trip_type")["duration_min"].mean()[cat_order]
avg_rental_max = avg_rental_counts.max()
# highlight values within 20% of the largest average
clrs = ['gold' if (count > ((avg_rental_max*4)/5)) else 'limegreen' for count in avg_rental_counts]
# get the current tick locations and labels
locs, labels = plt.xticks()
# one label per category, shifted left of and above the plotted point
for loc, avg_rental_count, clr in zip(locs, avg_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(avg_rental_count))
    plt.text(loc-0.2, avg_rental_count + int(avg_rental_max/10), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

# right plot: point plot - most frequent trip duration per trip type
sb.set_style('white')
plt.subplot(1, 2, 2)
oneway_mode = temp_df.query(' trip_type == "One Way" ').duration_min.mode()[0]
roundtrip_mode = temp_df.query(' trip_type == "Round Trip" ').duration_min.mode()[0]
heights = [oneway_mode, roundtrip_mode]
labels = temp_df.trip_type.sort_values(ascending=True).unique()
sb.pointplot(x = labels, y = heights, color = base_color)
plt.title('Most frequent trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.xlabel('\nTrip type', fontsize = 14)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xticks(fontsize = 12)
# reuse the left panel's y-range so both panels are directly comparable
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+5, 5)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine()

# add annotations
# -------------------------------------------------------
freq_rental_counts = heights
freq_rental_max = max(freq_rental_counts)
clrs = ['gold' if (count > ((freq_rental_max*4)/5)) else 'limegreen' for count in freq_rental_counts]
# get the current tick locations and labels
locs, labels = plt.xticks()
for loc, freq_rental_count, clr in zip(locs, freq_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(freq_rental_count))
    plt.text(loc-0.1, freq_rental_count + (freq_rental_max/10), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of rental durations under 120 min based on trip type\n', fontsize = 16, weight = 'bold')
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.19.i Average trip durations under 2 hours based on trip type.png', dpi=300, bbox_inches='tight')
If the trip type is One Way, and the dataset is limited to trips under 120 minutes, then the bike rentals have an average rental duration of 16 minutes and a mode of 5 minutes.
If the trip type is Round Trip, and the dataset is limited to trips under 120 minutes (excluding trips of 1 minute or less), then the average rental duration increases to 39 minutes and the mode increases to 28 minutes.
Dataset limited under 30 min:
# Summary statistics (count, mean, std, min, quartiles, max) of all trip durations.
bikeshare['duration_min'].describe()
The calculations are influenced by the presence of outliers. Calculate the average duration by limiting the dataset to entries under 30 minutes, which constitute 75% of the duration distribution.
# Keep only the trips whose duration is at most 30 minutes.
duration_lim_30 = bikeshare[bikeshare['duration_min'] <= 30]
# Histogram of every trip lasting at most 30 minutes, using one-minute bins.
sb.set_style('white')
sb.set_palette(palette="colorblind", n_colors=10, desat=0.8)
base_color = sb.color_palette()[9]

# extend the last edge past the maximum so the longest trips get their own bin
bin_edges = np.arange(0, duration_lim_30.duration_min.max() + 2, 1)
plt.hist(duration_lim_30['duration_min'], color=base_color, bins=bin_edges)

plt.title('Distribution of trip durations under 30 minutes\n', weight='bold', fontsize=16)
plt.xlabel('\nDuration (minutes)', fontsize=14)
plt.ylabel('Rentals (thousands)\n', fontsize=14)
plt.xticks(fontsize=12)

# relabel the y axis every 25,000 rentals, expressed in thousands
locs, labels = plt.yticks()
y_tick_locs = np.arange(0, int(math.ceil(max(locs))) + 1000, 25000)
y_tick_names = ['{:0.0f} K'.format(tick / 1000) for tick in y_tick_locs]
plt.yticks(y_tick_locs, y_tick_names, fontsize=12)
sb.despine()

# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.19.j Distribution of rental durations under 30 minutes on trip type.png', dpi=300, bbox_inches='tight')
# Facet grid: one histogram of trip durations (at most 30 minutes) per trip type.
from matplotlib.ticker import FuncFormatter

sb.set_style('white')
sb.set_palette(palette = "colorblind", n_colors = 10, desat = 0.8)
base_color = sb.color_palette()[9]

g = sb.FacetGrid(data = duration_lim_30, col = 'trip_type', col_wrap = 2, height = 4, aspect = 1)
g.map(plt.hist, "duration_min", color = base_color, bins = np.arange(0, duration_lim_30.duration_min.max()+2, 1))
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Distribution of rental durations under 30 min based on trip type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')

# Show the rental counts in thousands. The labels are derived from the tick
# values through a formatter instead of parsing the tick-label text: that text
# may still be empty before the figure is drawn, and labels collected from all
# facets do not match the number of ticks of any single facet.
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(FuncFormatter(lambda value, _pos: '{:0.0f} K'.format(value/1000)))
    ax.tick_params(axis = 'both', labelsize = 12)

g.set_xlabels('\nDuration (minutes)', size = 14)
g.set_ylabels('Rentals (thousands)\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.19.k Facet Grid of rental durations under 30 minutes on trip type.png', dpi=300, bbox_inches='tight')
# Three views of the same data (trips of at most 30 minutes, split by trip
# type): violin plot, box plot and strip plot, drawn left to right.
plt.figure(figsize=[14, 5])
sb.set_style('darkgrid')
base_color = sb.color_palette()[9]

# (plotting function, extra keyword arguments, panel title, y-axis label)
panels = [
    (sb.violinplot, {'inner': 'quartile'}, 'Trip durations - Violin plot\n', 'Duration (minutes)\n'),
    (sb.boxplot, {}, 'Trip durations - Box plot\n', ''),
    (sb.stripplot, {'alpha': 0.002}, 'Trip durations - Strip plot\n', ''),
]
for position, (draw_panel, extra_args, heading, y_text) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    draw_panel(data=duration_lim_30, x='trip_type', y='duration_min', color=base_color, **extra_args)
    plt.title(heading, weight='bold', fontsize=16, color='dimgrey')
    plt.xlabel('\nTrip type', fontsize=14)
    # only the leftmost panel carries a y-axis label
    plt.ylabel(y_text, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.75)
plt.suptitle('Distribution of rental durations under 30 min based on trip type\n', fontsize=16, weight='bold')
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.19.l Distribution of trip durations under 30 minutes on various plots.png', dpi=300, bbox_inches='tight')
Calculate the average trip duration and the most frequent trip duration for each trip type.
# Mean (rounded up) and mode of the trip duration for each trip type, computed
# on the trips of at most 30 minutes.
oneway_durations = duration_lim_30.query(' trip_type == "One Way" ').duration_min
roundtrip_durations = duration_lim_30.query(' trip_type == "Round Trip" ').duration_min

oneway_mean = math.ceil(oneway_durations.mean())
oneway_mode = oneway_durations.mode()[0]
roundtrip_mean = math.ceil(roundtrip_durations.mean())
roundtrip_mode = roundtrip_durations.mode()[0]

print('Duration mean'.center(30, '-'))
print('oneway_mean : ', oneway_mean, 'minutes')
print('roundtrip_mean : ', roundtrip_mean, 'minutes')
print('\n')
print('Duration mode'.center(30, '-'))
print('oneway_mode : ', oneway_mode, 'minutes')
print('roundtrip_mode : ', roundtrip_mode, 'minutes')
# Average and most frequent trip duration per trip type, for trips of at most
# 30 minutes, drawn as two point plots that share the same y-range.
plt.figure(figsize = [12, 5])
sb.set_palette(palette = "colorblind", n_colors = 10, desat = 0.8)
base_color = sb.color_palette()[9]

# left plot: point plot - average trip duration per trip type
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.pointplot(data = duration_lim_30, x = "trip_type", y = "duration_min", linestyles = "-", color = base_color)
plt.title('Avg. Trip duration - Point plot\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize = 14)
plt.xlabel('\nTrip type', fontsize = 14)
plt.xticks(fontsize = 12)
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 5)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True)

# add annotations
# -------------------------------------------------------
cat_order = duration_lim_30.trip_type.sort_values(ascending=True).unique()
# Average only the duration column; a frame-wide mean() is wasted work and
# fails on non-numeric columns in recent pandas releases.
avg_rental_counts = duration_lim_30.groupby("trip_type")["duration_min"].mean()[cat_order]
avg_rental_max = avg_rental_counts.max()
# highlight values within 20% of the largest average
clrs = ['gold' if (count > ((avg_rental_max*4)/5)) else 'limegreen' for count in avg_rental_counts]
# get the current tick locations and labels
locs, labels = plt.xticks()
# one label per category, placed above the plotted point
for loc, avg_rental_count, clr in zip(locs, avg_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(avg_rental_count))
    plt.text(loc, avg_rental_count + int(avg_rental_max/10), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

# right plot: point plot - most frequent trip duration per trip type
sb.set_style('white')
plt.subplot(1, 2, 2)
oneway_mode = duration_lim_30.query(' trip_type == "One Way" ').duration_min.mode()[0]
roundtrip_mode = duration_lim_30.query(' trip_type == "Round Trip" ').duration_min.mode()[0]
heights = [oneway_mode, roundtrip_mode]
labels = duration_lim_30.trip_type.sort_values(ascending=True).unique()
sb.pointplot(x = labels, y = heights, color = base_color)
plt.title('Most frequent trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.xlabel('\nTrip type', fontsize = 14)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xticks(fontsize = 12)
# reuse the left panel's y-range so both panels are directly comparable
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+5, 5)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine()

# add annotations
# -------------------------------------------------------
freq_rental_counts = heights
freq_rental_max = max(freq_rental_counts)
clrs = ['gold' if (count > ((freq_rental_max*4)/5)) else 'limegreen' for count in freq_rental_counts]
# get the current tick locations and labels
locs, labels = plt.xticks()
for loc, freq_rental_count, clr in zip(locs, freq_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(freq_rental_count))
    plt.text(loc+0.1, freq_rental_count + (freq_rental_max/3), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of rental durations under 30 min based on trip type\n', fontsize = 16, weight = 'bold')
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.19.m Average trip durations under 30 minutes based on trip type.png', dpi=300, bbox_inches='tight')
If the trip type is One Way, and the dataset is limited to trips under 30 minutes, then the bike rentals have an average rental duration of 10 minutes and a mode of 5 minutes.
If the trip type is Round Trip, and the dataset is limited to trips under 30 minutes, then the bike rentals have an average rental duration of 11 minutes and a mode of 1 minute.
The 1-minute mode for round trips is probably caused by bicycles being returned immediately after rental because of a technical or other issue. Hence, exclude the round trips of 1 minute or less and re-evaluate the mode.
# Re-assess trips of at most 30 minutes after discarding round trips of one
# minute or less (presumably immediate returns), then redraw the average and
# the most frequent trip duration per trip type side by side.
drop_index = duration_lim_30.query(' trip_type == "Round Trip" and duration_min <= 1 ').index
temp_df = duration_lim_30.drop(drop_index)

plt.figure(figsize = [12, 5])
sb.set_palette(palette = "colorblind", n_colors = 10, desat = 0.8)
base_color = sb.color_palette()[9]

# left plot: point plot - average trip duration per trip type
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.pointplot(data = temp_df, x = "trip_type", y = "duration_min", linestyles = "-", color = base_color)
plt.title('Avg. Trip duration - Point plot\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize = 14)
plt.xlabel('\nTrip type', fontsize = 14)
plt.xticks(fontsize = 12)
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+15, 5)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True)

# add annotations
# -------------------------------------------------------
cat_order = temp_df.trip_type.sort_values(ascending=True).unique()
# Average only the duration column; a frame-wide mean() is wasted work and
# fails on non-numeric columns in recent pandas releases.
avg_rental_counts = temp_df.groupby("trip_type")["duration_min"].mean()[cat_order]
avg_rental_max = avg_rental_counts.max()
# highlight values within 20% of the largest average
clrs = ['gold' if (count > ((avg_rental_max*4)/5)) else 'limegreen' for count in avg_rental_counts]
# get the current tick locations and labels
locs, labels = plt.xticks()
# one label per category, placed above the plotted point
for loc, avg_rental_count, clr in zip(locs, avg_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(avg_rental_count))
    plt.text(loc, avg_rental_count + int(avg_rental_max/5), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

# right plot: point plot - most frequent trip duration per trip type
sb.set_style('white')
plt.subplot(1, 2, 2)
oneway_mode = temp_df.query(' trip_type == "One Way" ').duration_min.mode()[0]
roundtrip_mode = temp_df.query(' trip_type == "Round Trip" ').duration_min.mode()[0]
heights = [oneway_mode, roundtrip_mode]
labels = temp_df.trip_type.sort_values(ascending=True).unique()
sb.pointplot(x = labels, y = heights, color = base_color)
plt.title('Most frequent trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.xlabel('\nTrip type', fontsize = 14)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xticks(fontsize = 12)
# reuse the left panel's y-range so both panels are directly comparable
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+5, 5)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine()

# add annotations
# -------------------------------------------------------
freq_rental_counts = heights
freq_rental_max = max(freq_rental_counts)
clrs = ['gold' if (count > ((freq_rental_max*4)/5)) else 'limegreen' for count in freq_rental_counts]
# get the current tick locations and labels
locs, labels = plt.xticks()
for loc, freq_rental_count, clr in zip(locs, freq_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(freq_rental_count))
    plt.text(loc-0.1, freq_rental_count + (freq_rental_max/10), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of rental durations under 30 min based on trip type\n', fontsize = 16, weight = 'bold')
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.19.n Average trip durations under 30 minutes based on trip type.png', dpi=300, bbox_inches='tight')
If the trip type is One Way, and the dataset is limited to trips under 30 minutes, then the bike rentals have an average rental duration of 11 minutes and a mode of 5 minutes.
If the trip type is Round Trip, and the dataset is limited to trips under 30 minutes (excluding trips of 1 minute or less), then the average rental duration increases to 18 minutes and the mode increases to 28 minutes.
Average trip durations, by the duration limit applied to the dataset:
| Dataset used to measure | Avg trip duration (min), One Way | Avg trip duration (min), Round Trip |
|---|---|---|
| trips under 30 minutes | 11 | 18 |
| trips under 120 minutes | 16 | 39 |
| overall trips | 24 | 71 |
Most frequent trip durations, by the duration limit applied to the dataset:
| Dataset used to measure | Most freq trip duration (min), One Way | Most freq trip duration (min), Round Trip |
|---|---|---|
| trips under 30 minutes | 5 | 28 |
| trips under 120 minutes | 5 | 28 |
| overall trips | 5 | 28 |
# Summary table of the averages and modes found above: one row per
# (duration limit, trip type) pair.
duration_df = pd.DataFrame({
    'dataset': ['< 30', '< 30', '< 120', '< 120', 'overall', 'overall'],
    'trip_type': ['One Way', 'Round Trip', 'One Way', 'Round Trip', 'One Way', 'Round Trip'],
    'duration_avg': [11, 18, 16, 39, 24, 71],
    'duration_mode': [5, 28, 5, 28, 5, 28],
})
duration_df
# Grouped bar charts of the summary table: average duration (left) and most
# frequent duration (right) per trip type, with one bar per dataset limit.
plt.figure(figsize=[12, 5])
sb.set_palette(palette="GnBu", n_colors=3, desat=None)
base_color = sb.color_palette()[2]


def _duration_bar_panel(position, value_column, heading, y_text, y_limits=None):
    """Draw one grouped-bar panel of duration_df and return its axes.

    position is the 1-based column of the 1x2 subplot grid, value_column the
    duration_df column plotted on the y axis, heading and y_text the panel
    title and y-axis label. When y_limits is given it is applied before the
    y ticks are rebuilt, so that panels can share a y-range.
    """
    sb.set_style('white')
    plt.subplot(1, 2, position)
    panel = sb.barplot(data=duration_df, x='trip_type', y=value_column, hue='dataset')
    plt.title(heading, weight='bold', fontsize=16, color='dimgrey')
    plt.ylabel(y_text, fontsize=14)
    plt.xlabel('\nTrip type', fontsize=14)
    plt.xticks(fontsize=12)
    if y_limits is not None:
        plt.ylim(y_limits)
    tick_positions, _ = plt.yticks()
    tens = np.arange(0, int(math.ceil(max(tick_positions))) + 10, 10)
    plt.yticks(tens, tens, fontsize=12)
    plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha=1,
               borderpad=1, borderaxespad=1, loc='upper left', labelspacing=0.5, ncol=1,
               title='Dataset', title_fontsize=12, fontsize=10, facecolor='white',
               markerfirst=True, handlelength=2, handletextpad=0.5)
    # write each bar's height just above its top edge
    for bar in panel.patches:
        bar_top = (bar.get_x() + bar.get_width() / 2., bar.get_height())
        panel.annotate(format(bar.get_height(), '.0f'), bar_top,
                       ha='center', va='center', xytext=(0, 10), textcoords='offset points', fontsize=12)
    # dashed divider between the two trip-type groups
    for divider in [0.5]:
        plt.axvline(divider, ls='--', color='grey', linewidth=1, alpha=0.4)
    sb.despine(top=True, bottom=False, left=False, right=True)
    return panel


# left plot: bar chart - average trip duration
ax1 = _duration_bar_panel(1, 'duration_avg', 'Avg. Trip durations\n', 'Avg. duration (minutes)\n')
# right plot: bar chart - most frequent trip duration, on the left panel's y-range
g = _duration_bar_panel(2, 'duration_mode', 'Most frequent durations\n', 'Duration (minutes)\n',
                        y_limits=ax1.get_ylim())

plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.75)
plt.suptitle('Assessment of Trip durations based on dataset over trip type\n', fontsize=16, weight='bold')
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.19.o Assessment of Trip durations based on dataset over trip type.png', dpi=300, bbox_inches='tight')
# Point plots of the summary table: average duration (left) and most frequent
# duration (right) per dataset limit, with one line per trip type.
plt.figure(figsize = [12, 6])
flatui = ['deepskyblue', 'sandybrown']
sb.set_palette(flatui, n_colors=2, desat=0.6)

# -------------------------------------------------------
# left plot: point plot - Avg trip duration
# -------------------------------------------------------
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.pointplot(data = duration_df, x = 'dataset', y = 'duration_avg', hue = 'trip_type')
plt.title('Avg. Trip durations\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize = 14)
plt.xlabel('\nTrip durations (minutes)', fontsize = 14)
plt.xticks(fontsize = 12)
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)

# add annotations
# -------------------------------------------------------
# x position of every duration_df row: two rows (one per trip type) per dataset
locs = [0, 0, 1, 1, 2, 2]
avg_rental_counts = duration_df["duration_avg"]
avg_rental_types = duration_df["trip_type"]
avg_rental_max = avg_rental_counts.max()
clrs = ['gold' if (trip == "Round Trip") else 'limegreen' for trip in avg_rental_types]
# one label per row, shifted left of and above the plotted point
for loc, avg_rental_count, clr in zip(locs, avg_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(avg_rental_count))
    plt.text(loc-0.2, avg_rental_count + int(avg_rental_max/20), pct_string, ha = 'center', color = 'black', fontsize = 12,
             bbox={'pad':1.9,'alpha':0.2,'color':'none','fc':clr})
# -------------------------------------------------------
# NOTE(review): the empty label string presumably blanks this panel's legend,
# leaving the right panel's legend to serve both - confirm on the rendered figure
plt.legend('', frameon=False, fancybox=False)
sb.despine(top=True, bottom=False, left=False, right=True)

# -------------------------------------------------------
# right plot: point plot - most frequent trip duration
# -------------------------------------------------------
sb.set_style('white')
plt.subplot(1, 2, 2)
sb.pointplot(data = duration_df, x = 'dataset', y = 'duration_mode', hue = 'trip_type')
plt.title('Most frequent durations\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xlabel('\nTrip durations (minutes)', fontsize = 14)
plt.xticks(fontsize = 12)
# reuse the left panel's y-range so both panels are directly comparable
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)

# add annotations
# -------------------------------------------------------
locs = [0, 0, 1, 1, 2, 2]
freq_rental_counts = duration_df["duration_mode"]
freq_rental_types = duration_df["trip_type"]
freq_rental_max = freq_rental_counts.max()
clrs = ['gold' if (trip == "Round Trip") else 'limegreen' for trip in freq_rental_types]
for loc, freq_rental_count, clr in zip(locs, freq_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(freq_rental_count))
    plt.text(loc, freq_rental_count + int(freq_rental_max/5), pct_string, ha = 'center', color = 'black', fontsize = 12,
             bbox={'pad':1.9,'alpha':0.2,'color':'none','fc':clr})
# -------------------------------------------------------
# legend anchored outside the axes, up and to the left of the right panel
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha = 1,
           borderpad=1, borderaxespad=1, loc = 'upper left', labelspacing=0.5, ncol = 2,
           title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(-0.5, 1.5))
sb.despine(top=True, bottom=False, left=False, right=True)
# -------------------------------------------------------

plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.65)
plt.suptitle('Assessment of trip durations based on trip type over dataset\n', fontsize = 16, weight = 'bold')
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.19.p Assessment of trip durations based on trip type over dataset.png', dpi=300, bbox_inches='tight')
duration_min and bike_type columns:
Columns: duration_min, bike_type
Data type: (Numerical, continuous) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
sb.set_style('white')
# Facet grid: one histogram of trip durations per bike type (100-minute bins).
from matplotlib.ticker import FuncFormatter

sb.set_palette(palette = "colorblind", n_colors = 10, desat = 0.6)
base_color = sb.color_palette()[2]

g = sb.FacetGrid(data = bikeshare, col = 'bike_type', col_wrap = 2, height = 4, aspect = 1)
g.map(plt.hist, "duration_min", color = base_color, bins = np.arange(0, bikeshare.duration_min.max()+100, 100))
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Distribution of rental durations based on bike type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')

# Show the rental counts in millions. The labels are derived from the tick
# values through a formatter instead of parsing the tick-label text: that text
# may still be empty before the figure is drawn, and labels collected from all
# facets do not match the number of ticks of any single facet.
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(FuncFormatter(lambda value, _pos: '{:0.1f} M'.format(value/1000000)))
    ax.tick_params(axis = 'both', labelsize = 12)

g.set_xlabels('\nDuration (minutes)', size = 14)
g.set_ylabels('Rentals (millions)\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.20.a Facet Grid of rental durations on bike type.png', dpi=300, bbox_inches='tight')
The distribution of duration_min values is vague and not easy to interpret. This can be overcome by limiting the plot to a threshold value. Let us plot a wider variety of graphs to observe the distribution of the data and uncover hidden insights.
# Three views of the trip durations split by bike type: violin plot, box plot
# and strip plot, drawn left to right.
plt.figure(figsize=[16, 4])
sb.set_style('darkgrid')
sb.set_palette(palette="colorblind", n_colors=10, desat=0.6)
base_color = sb.color_palette()[2]

# (plotting function, extra keyword arguments, panel title, y-axis label)
panels = [
    (sb.violinplot, {'inner': 'quartile'}, 'Trip durations - Violin plot\n', 'Duration (minutes)\n'),
    (sb.boxplot, {}, 'Trip durations - Box plot\n', ''),
    (sb.stripplot, {'alpha': 0.05}, 'Trip durations - Strip plot\n', ''),
]
for position, (draw_panel, extra_args, heading, y_text) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    draw_panel(data=bikeshare, x='bike_type', y='duration_min', color=base_color, **extra_args)
    plt.title(heading, weight='bold', fontsize=16)
    plt.xlabel('\nBike type', fontsize=14)
    # only the leftmost panel carries a y-axis label
    plt.ylabel(y_text, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

plt.subplots_adjust(wspace=0.3, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.20.b Distribution of Bike type durations on various plots.png', dpi=300, bbox_inches='tight')
The above plot shows a long tail of outliers, which requires closer observation for a better understanding of the data distribution.
Calculate the average trip duration and the most frequent trip duration for each bike type.
# Mean (rounded up to a whole minute) and mode of trip duration per bike type,
# computed on the full dataset.
durations_by_type = {
    kind: bikeshare.loc[bikeshare['bike_type'] == kind, 'duration_min']
    for kind in ('unknown', 'Standard', 'Electric', 'Smart')
}
unknown_mean = math.ceil(durations_by_type['unknown'].mean())
unknown_mode = durations_by_type['unknown'].mode()[0]
standard_mean = math.ceil(durations_by_type['Standard'].mean())
standard_mode = durations_by_type['Standard'].mode()[0]
electric_mean = math.ceil(durations_by_type['Electric'].mean())
electric_mode = durations_by_type['Electric'].mode()[0]
smart_mean = math.ceil(durations_by_type['Smart'].mean())
smart_mode = durations_by_type['Smart'].mode()[0]

# Report the means first, then the modes, each under a centred dashed heading.
print('Duration mean'.center(30, '-'))
for caption, minutes in [('unknown_mean : ', unknown_mean),
                         ('standard_mean : ', standard_mean),
                         ('electric_mean : ', electric_mean),
                         ('smart_mean : ', smart_mean)]:
    print(caption, minutes, 'minutes')
print('\n')
print('Duration mode'.center(30, '-'))
for caption, minutes in [('unknown_mode : ', unknown_mode),
                         ('standard_mode : ', standard_mode),
                         ('electric_mode : ', electric_mode),
                         ('smart_mode : ', smart_mode)]:
    print(caption, minutes, 'minutes')
# Two-panel figure for the full dataset:
#   left  - mean trip duration per bike type (seaborn point plot)
#   right - most frequent (mode) trip duration per bike type (seaborn point plot)
# Every point is annotated with its value rounded up to a whole minute.
plt.figure(figsize = [12, 5])
sb.set_palette(palette = "colorblind", n_colors = 10, desat = 0.6)
base_color = sb.color_palette()[2]

# left plot: point plot - Avg trip duration
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.pointplot(data = bikeshare, x = "bike_type", y = "duration_min", linestyles = "-", color = base_color)
plt.title('Avg. Trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize = 14)
plt.xlabel('\nBike type', fontsize = 14)
plt.xticks(fontsize = 12)
# replace the automatic y ticks with ticks every 10 minutes starting at 0
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);

# add annotations
# -------------------------------------------------------
cat_order = bikeshare.bike_type.sort_values(ascending=True).unique()
# Select the column before aggregating: a frame-wide groupby().mean() raises
# TypeError on non-numeric columns in pandas >= 2.0.
avg_rental_counts = bikeshare.groupby("bike_type")["duration_min"].mean()[cat_order]
avg_rental_max = avg_rental_counts.max()
# highlight values above 80% of the largest mean in gold, the rest in green
clrs = ['gold' if (count > ((avg_rental_max*4)/5)) else 'limegreen' for count in avg_rental_counts ]
# get the current tick locations and labels
locs, labels = plt.xticks()
# loop through each tick location together with its mean and box colour
for loc, count, clr in zip(locs, avg_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    # place the annotation slightly left of and above the plotted point
    plt.text(loc-0.1, count + int(avg_rental_max/10), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

# right plot: point plot - most frequent trip duration
sb.set_style('white')
plt.subplot(1, 2, 2)
unknown_mode = bikeshare.query(' bike_type == "unknown" ').duration_min.mode()[0]
standard_mode = bikeshare.query(' bike_type == "Standard" ').duration_min.mode()[0]
electric_mode = bikeshare.query(' bike_type == "Electric" ').duration_min.mode()[0]
smart_mode = bikeshare.query(' bike_type == "Smart" ').duration_min.mode()[0]
# NOTE(review): heights is paired with the sorted unique bike types by position,
# so this assumes the sorted order is unknown, Standard, Electric, Smart - confirm.
heights = [unknown_mode, standard_mode, electric_mode, smart_mode]
labels = bikeshare.bike_type.sort_values(ascending=True).unique()
sb.pointplot(x = labels, y = heights, color = base_color)
plt.title('Most frequent trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.xlabel('\nBike type', fontsize = 14)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xticks(fontsize = 12)
# reuse the left panel's y range so both panels are directly comparable
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine();

# add annotations
# -------------------------------------------------------
freq_rental_counts = heights
freq_rental_max = max(freq_rental_counts)
clrs = ['gold' if (count > ((freq_rental_max*4)/5)) else 'limegreen' for count in freq_rental_counts ]
# get the current tick locations and labels
locs, labels = plt.xticks()
# loop through each tick location together with its mode and box colour
for loc, count, clr in zip(locs, freq_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    # place the annotation above the plotted point
    plt.text(loc, count + (freq_rental_max/2), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of all rental durations based on bike type\n', fontsize = 16, weight = 'bold');
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.20.c Average trip durations based on bike type.png', dpi=300, bbox_inches='tight')
- If the `bike_type` is `unknown`, then the bike rentals have an average rental duration of `31 minutes` and a mode of `6 minutes`.
- If the `bike_type` is `Standard`, then the bike rentals have an average rental duration of `31 minutes` and a mode of `5 minutes`.
- If the `bike_type` is `Electric`, then the bike rentals have an average rental duration of `24 minutes` and a mode of `4 minutes`.
- If the `bike_type` is `Smart`, then the bike rentals have an average rental duration of `42 minutes` and a mode of `1 minute`.
A mode of 1 minute is probably caused by bicycles being returned immediately after rental because of a technical or other issue. Hence, exclude the trips that last 1 minute or less and re-evaluate the statistics.
# Remove every trip that lasted one minute or less before recomputing the statistics.
drop_index = bikeshare.index[bikeshare['duration_min'] <= 1]
temp_df = bikeshare.drop(index=drop_index)
# Two-panel figure for the dataset without trips of one minute or less (temp_df):
#   left  - mean trip duration per bike type (seaborn point plot)
#   right - most frequent (mode) trip duration per bike type (seaborn point plot)
# Every point is annotated with its value rounded up to a whole minute.
plt.figure(figsize = [12, 5])
sb.set_palette(palette = "colorblind", n_colors = 10, desat = 0.6)
base_color = sb.color_palette()[2]

# left plot: point plot - Avg trip duration
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.pointplot(data = temp_df, x = "bike_type", y = "duration_min", linestyles = "-", color = base_color)
plt.title('Avg. Trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize = 14)
plt.xlabel('\nBike type', fontsize = 14)
plt.xticks(fontsize = 12)
# replace the automatic y ticks with ticks every 10 minutes starting at 0
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);

# add annotations
# -------------------------------------------------------
cat_order = temp_df.bike_type.sort_values(ascending=True).unique()
# Select the column before aggregating: a frame-wide groupby().mean() raises
# TypeError on non-numeric columns in pandas >= 2.0.
avg_rental_counts = temp_df.groupby("bike_type")["duration_min"].mean()[cat_order]
avg_rental_max = avg_rental_counts.max()
# highlight values above 80% of the largest mean in gold, the rest in green
clrs = ['gold' if (count > ((avg_rental_max*4)/5)) else 'limegreen' for count in avg_rental_counts ]
# get the current tick locations and labels
locs, labels = plt.xticks()
# loop through each tick location together with its mean and box colour
for loc, count, clr in zip(locs, avg_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    # place the annotation slightly left of and above the plotted point
    plt.text(loc-0.1, count + int(avg_rental_max/10), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

# right plot: point plot - most frequent trip duration
sb.set_style('white')
plt.subplot(1, 2, 2)
unknown_mode = temp_df.query(' bike_type == "unknown" ').duration_min.mode()[0]
standard_mode = temp_df.query(' bike_type == "Standard" ').duration_min.mode()[0]
electric_mode = temp_df.query(' bike_type == "Electric" ').duration_min.mode()[0]
smart_mode = temp_df.query(' bike_type == "Smart" ').duration_min.mode()[0]
# NOTE(review): heights is paired with the sorted unique bike types by position,
# so this assumes the sorted order is unknown, Standard, Electric, Smart - confirm.
heights = [unknown_mode, standard_mode, electric_mode, smart_mode]
labels = temp_df.bike_type.sort_values(ascending=True).unique()
sb.pointplot(x = labels, y = heights, color = base_color)
plt.title('Most frequent trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.xlabel('\nBike type', fontsize = 14)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xticks(fontsize = 12)
# reuse the left panel's y range so both panels are directly comparable
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine();

# add annotations
# -------------------------------------------------------
freq_rental_counts = heights
freq_rental_max = max(freq_rental_counts)
clrs = ['gold' if (count > ((freq_rental_max*4)/5)) else 'limegreen' for count in freq_rental_counts ]
# get the current tick locations and labels
locs, labels = plt.xticks()
# loop through each tick location together with its mode and box colour
for loc, count, clr in zip(locs, freq_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    # place the annotation slightly left of and above the plotted point
    plt.text(loc-0.1, count + (freq_rental_max/2), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of all rental durations based on bike type\n', fontsize = 16, weight = 'bold');
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.20.d Average trip durations based on bike type.png', dpi=300, bbox_inches='tight')
When the trips with durations <= 1 minute are removed from the assessment, then the result is as follows:
- If the `bike_type` is `unknown`, then the bike rentals have an average rental duration of `31 minutes` and a mode of `6 minutes`.
- If the `bike_type` is `Standard`, then the bike rentals have an average rental duration of `31 minutes` and a mode of `5 minutes`.
- If the `bike_type` is `Electric`, then the bike rentals have an increase in average rental duration to `25 minutes` and a mode of `4 minutes`.
- If the `bike_type` is `Smart`, then the bike rentals have an increase in average rental duration to `45 minutes` and a considerable increase in mode to `7 minutes`.
Dataset limited under 120 min:
# Summary statistics of the trip duration column (displayed as the cell output).
bikeshare['duration_min'].describe()
The distribution of `duration_min` values is vague and not easy to interpret. This can be overcome by limiting the dataset to a threshold value, say `120 minutes` or `2 hours`, as most of the `duration_min` values fall under this boundary.
# Percentage of all trips that last at most 2 hours (120 minutes), rounded to 2 decimals.
np.round((bikeshare['duration_min'].le(120).sum() / len(bikeshare)) * 100, 2)
# Trips longer than 120 minutes, i.e. the rows the 2-hour limit would drop.
duration_120 = bikeshare[bikeshare['duration_min'] > 120]

# For each bike type: percentage of its trips that would be dropped,
# rounded to two decimals.
drop_share = {}
for kind in ('unknown', 'Standard', 'Electric', 'Smart'):
    dropped_rows = len(duration_120[duration_120['bike_type'] == kind])
    all_rows = len(bikeshare[bikeshare['bike_type'] == kind])
    drop_share[kind] = np.round((dropped_rows / all_rows) * 100, 2)

unknown_type_drops = drop_share['unknown']
standard_type_drops = drop_share['Standard']
electric_type_drops = drop_share['Electric']
smart_type_drops = drop_share['Smart']

# Print one left-aligned, 60-character-wide caption per bike type.
for caption, share in [
    ("Proportion of unknown type entries that will be dropped", unknown_type_drops),
    ("Proportion of standard type entries that will be dropped", standard_type_drops),
    ("Proportion of electric type entries that will be dropped", electric_type_drops),
    ("Proportion of smart type entries that will be dropped", smart_type_drops),
]:
    print(caption.ljust(60, ' '), ':', share)
# Keep only the trips that last at most 2 hours (120 minutes).
duration_lim_120 = bikeshare[bikeshare['duration_min'] <= 120]
# Histogram of trip durations for rides of at most 120 minutes, using 5-minute bins.
sb.set_style('white')
sb.set_palette(palette="colorblind", n_colors=10, desat=0.6)
base_color = sb.color_palette()[2]

longest_trip = duration_lim_120['duration_min'].max()
bin_edges = np.arange(0, longest_trip + 10, 5)
plt.hist(duration_lim_120['duration_min'], bins=bin_edges, color=base_color)
plt.title('Distribution of trip durations under 2 hours\n', weight='bold', fontsize=16)
plt.xlabel('\nDuration (minutes)', fontsize=14)
plt.ylabel('Rentals (thousands)\n', fontsize=14)
plt.xticks(fontsize=12)

# Re-label the y axis in thousands of rentals, with one tick every 50,000 rentals.
locs, labels = plt.yticks()
highest_tick = int(math.ceil(max(locs)))
y_tick_locs = np.arange(0, highest_tick + 1000, 50000)
y_tick_names = ['{:0.0f} K'.format(tick / 1000) for tick in y_tick_locs]
plt.yticks(y_tick_locs, y_tick_names, fontsize=12)
sb.despine()

# bbox_inches='tight' makes the saved image include all of the x and y labels
plt.savefig('plots/3.2.20.e Distribution of trip durations under 2 hours.png', dpi=300, bbox_inches='tight')
# Facet grid (one histogram per bike type, two per row) of trip durations
# for rides of at most 120 minutes, using 5-minute bins.
from matplotlib.ticker import FuncFormatter

sb.set_style('white')
sb.set_palette(palette = "colorblind", n_colors = 10, desat = 0.6)
base_color = sb.color_palette()[2]
g = sb.FacetGrid(data = duration_lim_120, col = 'bike_type', col_wrap = 2, height = 4, aspect = 1, xlim=(0,120))
g.set(xmargin=0.5, ymargin=0.5)
g.map(plt.hist, "duration_min", color = base_color, bins = np.arange(0, duration_lim_120.duration_min.max()+10, 5))
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Distribution of rental durations under 120 min based on bike type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')

# Show the y ticks in thousands and set the tick label size on both axes.
# A formatter is attached to each axis instead of parsing the tick-label text:
# the text can be empty or non-integer, and one label list concatenated across
# all facets does not match the number of ticks on any single axis.
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(FuncFormatter(lambda value, _pos: '{:0.0f} K'.format(value/1000)))
    ax.tick_params(axis = 'both', labelsize = 12)

g.set_xlabels('\nDuration (minutes)', size = 14)
g.set_ylabels('Rentals (thousands)\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3);
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.20.f Facet Grid of rental durations under 2 hours on bike type.png', dpi=300, bbox_inches='tight')
Let us plot a wider variety of graphs to observe the distribution of the data and uncover hidden insights.
# Side-by-side violin, box and strip plots of trip duration for each bike type,
# restricted to rides of at most 120 minutes.
plt.figure(figsize=[16, 5])
sb.set_style('darkgrid')
sb.set_palette(palette="colorblind", n_colors=10, desat=0.6)
base_color = sb.color_palette()[2]

# One entry per panel, left to right:
# (plotting function, extra keyword arguments, panel title, x-axis label, y-axis label)
panels = [
    (sb.violinplot, {'inner': 'quartile'}, 'Trip durations - Violin plot\n', '\nBike type', 'Duration (minutes)\n'),
    (sb.boxplot, {}, 'Trip durations - Box plot\n', '\nBike Type', ''),
    (sb.stripplot, {'alpha': 0.002}, 'Trip durations - Strip plot\n', '\nBike Type', ''),
]
for position, (draw, extra, heading, x_text, y_text) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    draw(data=duration_lim_120, x='bike_type', y='duration_min', color=base_color, **extra)
    plt.title(heading, weight='bold', fontsize=16, color='dimgrey')
    plt.xlabel(x_text, fontsize=14)
    plt.ylabel(y_text, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

# Leave room at the top of the figure for the overall title.
plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.75)
plt.suptitle('Distribution of rental durations under 120 min based on bike type\n', fontsize=16, weight='bold')

# bbox_inches='tight' makes the saved image include all of the x and y labels
plt.savefig('plots/3.2.20.g Distribution of Bike type durations under 2 hours on various plots.png', dpi=300, bbox_inches='tight')
The above plot depicts the presence of a long tail of outliers, which requires even closer observation for a better understanding of the data distribution.
Calculate the average trip duration and the most frequent trip duration for each bike type.
# Mean (rounded up to a whole minute) and mode of trip duration per bike type,
# computed on rides of at most 120 minutes.
durations_by_type = {
    kind: duration_lim_120.loc[duration_lim_120['bike_type'] == kind, 'duration_min']
    for kind in ('unknown', 'Standard', 'Electric', 'Smart')
}
unknown_mean = math.ceil(durations_by_type['unknown'].mean())
unknown_mode = durations_by_type['unknown'].mode()[0]
standard_mean = math.ceil(durations_by_type['Standard'].mean())
standard_mode = durations_by_type['Standard'].mode()[0]
electric_mean = math.ceil(durations_by_type['Electric'].mean())
electric_mode = durations_by_type['Electric'].mode()[0]
smart_mean = math.ceil(durations_by_type['Smart'].mean())
smart_mode = durations_by_type['Smart'].mode()[0]

# Report the means first, then the modes, each under a centred dashed heading.
print('Duration mean'.center(30, '-'))
for caption, minutes in [('unknown_mean : ', unknown_mean),
                         ('standard_mean : ', standard_mean),
                         ('electric_mean : ', electric_mean),
                         ('smart_mean : ', smart_mean)]:
    print(caption, minutes, 'minutes')
print('\n')
print('Duration mode'.center(30, '-'))
for caption, minutes in [('unknown_mode : ', unknown_mode),
                         ('standard_mode : ', standard_mode),
                         ('electric_mode : ', electric_mode),
                         ('smart_mode : ', smart_mode)]:
    print(caption, minutes, 'minutes')
# Two-panel figure for rides of at most 120 minutes (duration_lim_120):
#   left  - mean trip duration per bike type (seaborn point plot)
#   right - most frequent (mode) trip duration per bike type (seaborn point plot)
# Every point is annotated with its value rounded up to a whole minute.
plt.figure(figsize = [12, 5])
sb.set_palette(palette = "colorblind", n_colors = 10, desat = 0.6)
base_color = sb.color_palette()[2]

# left plot: point plot - Avg trip duration
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.pointplot(data = duration_lim_120, x = "bike_type", y = "duration_min", linestyles = "-", color = base_color)
plt.title('Avg. Trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize = 14)
plt.xlabel('\nBike type', fontsize = 14)
plt.xticks(fontsize = 12)
# replace the automatic y ticks with ticks every 10 minutes starting at 0
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);

# add annotations
# -------------------------------------------------------
cat_order = duration_lim_120.bike_type.sort_values(ascending=True).unique()
# Select the column before aggregating: a frame-wide groupby().mean() raises
# TypeError on non-numeric columns in pandas >= 2.0.
avg_rental_counts = duration_lim_120.groupby("bike_type")["duration_min"].mean()[cat_order]
avg_rental_max = avg_rental_counts.max()
# highlight values above 80% of the largest mean in gold, the rest in green
clrs = ['gold' if (count > ((avg_rental_max*4)/5)) else 'limegreen' for count in avg_rental_counts ]
# get the current tick locations and labels
locs, labels = plt.xticks()
# loop through each tick location together with its mean and box colour
for loc, count, clr in zip(locs, avg_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    # place the annotation left of and above the plotted point
    plt.text(loc-0.2, count + int(avg_rental_max/7), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

# right plot: point plot - most frequent trip duration
sb.set_style('white')
plt.subplot(1, 2, 2)
unknown_mode = duration_lim_120.query(' bike_type == "unknown" ').duration_min.mode()[0]
standard_mode = duration_lim_120.query(' bike_type == "Standard" ').duration_min.mode()[0]
electric_mode = duration_lim_120.query(' bike_type == "Electric" ').duration_min.mode()[0]
smart_mode = duration_lim_120.query(' bike_type == "Smart" ').duration_min.mode()[0]
# NOTE(review): heights is paired with the sorted unique bike types by position,
# so this assumes the sorted order is unknown, Standard, Electric, Smart - confirm.
heights = [unknown_mode, standard_mode, electric_mode, smart_mode]
labels = duration_lim_120.bike_type.sort_values(ascending=True).unique()
sb.pointplot(x = labels, y = heights, color = base_color)
plt.title('Most frequent trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.xlabel('\nBike type', fontsize = 14)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xticks(fontsize = 12)
# reuse the left panel's y range so both panels are directly comparable
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine();

# add annotations
# -------------------------------------------------------
freq_rental_counts = heights
freq_rental_max = max(freq_rental_counts)
clrs = ['gold' if (count > ((freq_rental_max*4)/5)) else 'limegreen' for count in freq_rental_counts ]
# get the current tick locations and labels
locs, labels = plt.xticks()
# loop through each tick location together with its mode and box colour
for loc, count, clr in zip(locs, freq_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    # place the annotation above the plotted point
    plt.text(loc, count + (freq_rental_max/2), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of rental durations under 120 min based on bike type\n', fontsize = 16, weight = 'bold');
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.20.h Average trip durations under 2 hours based on bike type.png', dpi=300, bbox_inches='tight')
When the dataset is limited to trips under 120 minutes:
- If the `bike_type` is `unknown`, then the bike rentals have an average rental duration of `19 minutes` and a mode of `6 minutes`.
- If the `bike_type` is `Standard`, then the bike rentals have an average rental duration of `17 minutes` and a mode of `5 minutes`.
- If the `bike_type` is `Electric`, then the bike rentals have an average rental duration of `15 minutes` and a mode of `4 minutes`.
- If the `bike_type` is `Smart`, then the bike rentals have an average rental duration of `30 minutes` and a mode of `1 minute`.
A mode of 1 minute is probably caused by bicycles being returned immediately after rental because of a technical or other issue. Hence, exclude the trips that last 1 minute or less and re-evaluate the statistics.
# From the rides of at most 120 minutes, remove every trip that lasted one minute or less.
drop_index = duration_lim_120.index[duration_lim_120['duration_min'] <= 1]
temp_df = duration_lim_120.drop(index=drop_index)
# Two-panel figure for rides of at most 120 minutes, excluding trips of one
# minute or less (temp_df):
#   left  - mean trip duration per bike type (seaborn point plot)
#   right - most frequent (mode) trip duration per bike type (seaborn point plot)
# Every point is annotated with its value rounded up to a whole minute.
plt.figure(figsize = [12, 5])
sb.set_palette(palette = "colorblind", n_colors = 10, desat = 0.6)
base_color = sb.color_palette()[2]

# left plot: point plot - Avg trip duration
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.pointplot(data = temp_df, x = "bike_type", y = "duration_min", linestyles = "-", color = base_color)
plt.title('Avg. Trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize = 14)
plt.xlabel('\nBike type', fontsize = 14)
plt.xticks(fontsize = 12)
# replace the automatic y ticks with ticks every 10 minutes starting at 0
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);

# add annotations
# -------------------------------------------------------
cat_order = temp_df.bike_type.sort_values(ascending=True).unique()
# Select the column before aggregating: a frame-wide groupby().mean() raises
# TypeError on non-numeric columns in pandas >= 2.0.
avg_rental_counts = temp_df.groupby("bike_type")["duration_min"].mean()[cat_order]
avg_rental_max = avg_rental_counts.max()
# highlight values above 80% of the largest mean in gold, the rest in green
clrs = ['gold' if (count > ((avg_rental_max*4)/5)) else 'limegreen' for count in avg_rental_counts ]
# get the current tick locations and labels
locs, labels = plt.xticks()
# loop through each tick location together with its mean and box colour
for loc, count, clr in zip(locs, avg_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    # place the annotation slightly left of and above the plotted point
    plt.text(loc-0.1, count + int(avg_rental_max/10), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

# right plot: point plot - most frequent trip duration
sb.set_style('white')
plt.subplot(1, 2, 2)
unknown_mode = temp_df.query(' bike_type == "unknown" ').duration_min.mode()[0]
standard_mode = temp_df.query(' bike_type == "Standard" ').duration_min.mode()[0]
electric_mode = temp_df.query(' bike_type == "Electric" ').duration_min.mode()[0]
smart_mode = temp_df.query(' bike_type == "Smart" ').duration_min.mode()[0]
# NOTE(review): heights is paired with the sorted unique bike types by position,
# so this assumes the sorted order is unknown, Standard, Electric, Smart - confirm.
heights = [unknown_mode, standard_mode, electric_mode, smart_mode]
labels = temp_df.bike_type.sort_values(ascending=True).unique()
sb.pointplot(x = labels, y = heights, color = base_color)
plt.title('Most frequent trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.xlabel('\nBike type', fontsize = 14)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xticks(fontsize = 12)
# reuse the left panel's y range so both panels are directly comparable
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine();

# add annotations
# -------------------------------------------------------
freq_rental_counts = heights
freq_rental_max = max(freq_rental_counts)
clrs = ['gold' if (count > ((freq_rental_max*4)/5)) else 'limegreen' for count in freq_rental_counts ]
# get the current tick locations and labels
locs, labels = plt.xticks()
# loop through each tick location together with its mode and box colour
for loc, count, clr in zip(locs, freq_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    # place the annotation slightly left of and above the plotted point
    plt.text(loc-0.1, count + (freq_rental_max/2), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of rental durations under 120 min based on bike type\n', fontsize = 16, weight = 'bold');
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.20.i Average trip durations under 2 hours based on bike type.png', dpi=300, bbox_inches='tight')
When the dataset is limited to trips under 120 minutes (excluding 1 minute trips):
- If the `bike_type` is `unknown`, then the bike rentals have an average rental duration of `19 minutes` and a mode of `6 minutes`.
- If the `bike_type` is `Standard`, then the bike rentals have an average rental duration of `17 minutes` and a mode of `5 minutes`.
- If the `bike_type` is `Electric`, then the bike rentals have an increase in average rental duration to `16 minutes` and a mode of `4 minutes`.
- If the `bike_type` is `Smart`, then the bike rentals have an increase in average rental duration to `31 minutes` and an increase in mode to `7 minutes`.
Dataset limited under 30 min:
# Summary statistics of the trip duration column (displayed as the cell output).
bikeshare['duration_min'].describe()
The calculations are influenced by the presence of outliers. This can be overcome by limiting the dataset to a threshold value, say `30 minutes`, as more than 75% of the `duration_min` values fall under this boundary.
# Keep only the trips that last at most 30 minutes.
duration_lim_30 = bikeshare[bikeshare['duration_min'] <= 30]
# Histogram of trip durations for rides of at most 30 minutes, using 1-minute bins.
sb.set_style('white')
sb.set_palette(palette="colorblind", n_colors=10, desat=0.6)
base_color = sb.color_palette()[2]

longest_trip = duration_lim_30['duration_min'].max()
bin_edges = np.arange(0, longest_trip + 2, 1)
plt.hist(duration_lim_30['duration_min'], bins=bin_edges, color=base_color)
plt.title('Distribution of trip durations under 30 minutes\n', weight='bold', fontsize=16)
plt.xlabel('\nDuration (minutes)', fontsize=14)
plt.ylabel('Rentals (thousands)\n', fontsize=14)
plt.xticks(fontsize=12)

# Re-label the y axis in thousands of rentals, with one tick every 10,000 rentals.
locs, labels = plt.yticks()
highest_tick = int(math.ceil(max(locs)))
y_tick_locs = np.arange(0, highest_tick + 1000, 10000)
y_tick_names = ['{:0.0f} K'.format(tick / 1000) for tick in y_tick_locs]
plt.yticks(y_tick_locs, y_tick_names, fontsize=12)
sb.despine()

# bbox_inches='tight' makes the saved image include all of the x and y labels
plt.savefig('plots/3.2.20.j Distribution of trip durations under 30 minutes.png', dpi=300, bbox_inches='tight')
# Facet grid (one histogram per bike type, two per row) of trip durations
# for rides of at most 30 minutes, using 1-minute bins.
from matplotlib.ticker import FuncFormatter

sb.set_style('white')
sb.set_palette(palette = "colorblind", n_colors = 10, desat = 0.6)
base_color = sb.color_palette()[2]
g = sb.FacetGrid(data = duration_lim_30, col = 'bike_type', col_wrap = 2, height = 4, aspect = 1, xlim=(0,30))
g.set(xmargin=0.5, ymargin=0.5)
g.map(plt.hist, "duration_min", color = base_color, bins = np.arange(0, duration_lim_30.duration_min.max()+2, 1))
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Distribution of rental durations under 30 min based on bike type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')

# Show the y ticks in thousands and set the tick label size on both axes.
# A formatter is attached to each axis instead of parsing the tick-label text:
# the text can be empty or non-integer, and one label list concatenated across
# all facets does not match the number of ticks on any single axis.
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(FuncFormatter(lambda value, _pos: '{:0.0f} K'.format(value/1000)))
    ax.tick_params(axis = 'both', labelsize = 12)

g.set_xlabels('\nDuration (minutes)', size = 14)
g.set_ylabels('Rentals (thousands)\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3);
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.20.k Facet Grid of rental durations under 30 minutes on bike type.png', dpi=300, bbox_inches='tight')
Let us plot a wider variety of graphs to observe the distribution of the data and uncover hidden insights.
# Side-by-side violin, box and strip plots of trip duration for each bike type,
# restricted to rides of at most 30 minutes.
plt.figure(figsize=[16, 5])
sb.set_style('darkgrid')
sb.set_palette(palette="colorblind", n_colors=10, desat=0.6)
base_color = sb.color_palette()[2]

# One entry per panel, left to right:
# (plotting function, extra keyword arguments, panel title, x-axis label, y-axis label)
panels = [
    (sb.violinplot, {'inner': 'quartile'}, 'Trip durations - Violin plot\n', '\nBike type', 'Duration (minutes)\n'),
    (sb.boxplot, {}, 'Trip durations - Box plot\n', '\nBike Type', ''),
    (sb.stripplot, {'alpha': 0.002}, 'Trip durations - Strip plot\n', '\nBike Type', ''),
]
for position, (draw, extra, heading, x_text, y_text) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    draw(data=duration_lim_30, x='bike_type', y='duration_min', color=base_color, **extra)
    plt.title(heading, weight='bold', fontsize=16, color='dimgrey')
    plt.xlabel(x_text, fontsize=14)
    plt.ylabel(y_text, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

# Leave room at the top of the figure for the overall title.
plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.75)
plt.suptitle('Distribution of rental durations under 30 min based on bike type\n', fontsize=16, weight='bold')

# bbox_inches='tight' makes the saved image include all of the x and y labels
plt.savefig('plots/3.2.20.l Distribution of Bike type durations under 30 minutes on various plots.png', dpi=300, bbox_inches='tight')
Calculate the average trip duration and the most frequent trip duration for each bike type.
def _bike_type_duration_stats(bike_type):
    """Return (mean rounded up, most frequent value) of ``duration_min``.

    Only the rows of ``duration_lim_30`` whose ``bike_type`` equals the
    given value are considered.
    """
    durations = duration_lim_30.loc[duration_lim_30['bike_type'] == bike_type, 'duration_min']
    return math.ceil(durations.mean()), durations.mode()[0]


unknown_mean, unknown_mode = _bike_type_duration_stats('unknown')
standard_mean, standard_mode = _bike_type_duration_stats('Standard')
electric_mean, electric_mode = _bike_type_duration_stats('Electric')
smart_mean, smart_mode = _bike_type_duration_stats('Smart')

# report the means first, then the modes, one line per bike type
print('Duration mean'.center(30, '-'))
for caption, value in (('unknown_mean : ', unknown_mean),
                       ('standard_mean : ', standard_mean),
                       ('electric_mean : ', electric_mean),
                       ('smart_mean : ', smart_mean)):
    print(caption, value, 'minutes')
print('\n')
print('Duration mode'.center(30, '-'))
for caption, value in (('unknown_mode : ', unknown_mode),
                       ('standard_mode : ', standard_mode),
                       ('electric_mode : ', electric_mode),
                       ('smart_mode : ', smart_mode)):
    print(caption, value, 'minutes')
# Side-by-side point plots of the average (left) and the most frequent (right)
# trip duration per bike type for trips under 30 minutes. Every point is
# annotated with its value rounded up to a whole minute.
plt.figure(figsize = [12, 5])
sb.set_palette(palette = "colorblind", n_colors = 10, desat = 0.6)
base_color = sb.color_palette()[2]

# left plot: point plot - Avg trip duration
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.pointplot(data = duration_lim_30, x = "bike_type", y = "duration_min", linestyles = "-", color = base_color)
plt.title('Avg. Trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize = 14)
plt.xlabel('\nBike type', fontsize = 14)
plt.xticks(fontsize = 12)
# rebuild the y ticks in steps of 5, starting at 0
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 5)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);

# add annotations
# -------------------------------------------------------
cat_order = duration_lim_30.bike_type.sort_values(ascending=True).unique()
# Aggregate only the duration column: averaging the whole frame would also try
# to average its non-numeric columns, which pandas >= 2.0 rejects.
avg_rental_counts = duration_lim_30.groupby('bike_type', observed = True)['duration_min'].mean()[cat_order]
avg_rental_max = avg_rental_counts.max()
# values above 80% of the largest average are highlighted in gold
clrs = ['gold' if (count > ((avg_rental_max*4)/5)) else 'limegreen' for count in avg_rental_counts]
# get the current tick locations
locs, labels = plt.xticks()
# one annotation per tick, placed above the point
for loc, count, clr in zip(locs, avg_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    plt.text(loc-0.2, count + int(avg_rental_max/7), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

# right plot: point plot - most frequent trip duration
sb.set_style('white')
plt.subplot(1, 2, 2)
unknown_mode = duration_lim_30.query(' bike_type == "unknown" ').duration_min.mode()[0]
standard_mode = duration_lim_30.query(' bike_type == "Standard" ').duration_min.mode()[0]
electric_mode = duration_lim_30.query(' bike_type == "Electric" ').duration_min.mode()[0]
smart_mode = duration_lim_30.query(' bike_type == "Smart" ').duration_min.mode()[0]
labels = duration_lim_30.bike_type.sort_values(ascending=True).unique()
# Order the heights by the labels themselves so a value can never be plotted
# under the wrong bike type.
mode_by_type = {'unknown': unknown_mode, 'Standard': standard_mode,
                'Electric': electric_mode, 'Smart': smart_mode}
heights = [mode_by_type[bike_type] for bike_type in labels]
sb.pointplot(x = labels, y = heights, color = base_color)
plt.title('Most frequent trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.xlabel('\nBike type', fontsize = 14)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xticks(fontsize = 12)
# reuse the y range of the left plot so both panels are directly comparable
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+5, 5)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine();

# add annotations
# -------------------------------------------------------
freq_rental_counts = heights
freq_rental_max = max(freq_rental_counts)
clrs = ['gold' if (count > ((freq_rental_max*4)/5)) else 'limegreen' for count in freq_rental_counts]
# get the current tick locations
locs, labels = plt.xticks()
# one annotation per tick, placed above the point
for loc, count, clr in zip(locs, freq_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    plt.text(loc, count + (freq_rental_max/3), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of rental durations under 30 min based on bike type\n', fontsize = 16, weight = 'bold');
# bbox_inches='tight' makes the saved image include all of the x and y labels
plt.savefig('plots/3.2.20.m Average trip durations under 30 minutes based on bike type.png', dpi=300, bbox_inches='tight')
When the dataset is limited to trips under 30 minutes:
- If the `bike_type` is `unknown`, then the bike rentals have an average rental duration of 12 minutes and a mode of 6 minutes.
- If the `bike_type` is `Standard`, then the bike rentals have an average rental duration of 11 minutes and a mode of 5 minutes.
- If the `bike_type` is `Electric`, then the bike rentals have an average rental duration of 13 minutes and a mode of 4 minutes.
- If the `bike_type` is `Smart`, then the bike rentals have an average rental duration of 15 minutes and a mode of 1 minute.
The mode of 1 minute is probably caused by riders returning the bicycle immediately after renting it because of a technical or other issue. Hence, exclude the trips that last 1 minute or less and re-evaluate the statistics.
# Remove the trips that lasted one minute or less from the under-30-minute data.
short_trips = duration_lim_30[duration_lim_30['duration_min'] <= 1]
drop_index = short_trips.index
temp_df = duration_lim_30.drop(index=drop_index)
# Side-by-side point plots of the average (left) and the most frequent (right)
# trip duration per bike type, computed on temp_df. Every point is annotated
# with its value rounded up to a whole minute.
plt.figure(figsize = [12, 5])
sb.set_palette(palette = "colorblind", n_colors = 10, desat = 0.6)
base_color = sb.color_palette()[2]

# left plot: point plot - Avg trip duration
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.pointplot(data = temp_df, x = "bike_type", y = "duration_min", linestyles = "-", color = base_color)
plt.title('Avg. Trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize = 14)
plt.xlabel('\nBike type', fontsize = 14)
plt.xticks(fontsize = 12)
# rebuild the y ticks in steps of 5, starting at 0
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 5)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);

# add annotations
# -------------------------------------------------------
cat_order = temp_df.bike_type.sort_values(ascending=True).unique()
# Aggregate only the duration column: averaging the whole frame would also try
# to average its non-numeric columns, which pandas >= 2.0 rejects.
avg_rental_counts = temp_df.groupby('bike_type', observed = True)['duration_min'].mean()[cat_order]
avg_rental_max = avg_rental_counts.max()
# values above 80% of the largest average are highlighted in gold
clrs = ['gold' if (count > ((avg_rental_max*4)/5)) else 'limegreen' for count in avg_rental_counts]
# get the current tick locations
locs, labels = plt.xticks()
# one annotation per tick, placed above the point
for loc, count, clr in zip(locs, avg_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    plt.text(loc, count + int(avg_rental_max/7), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

# right plot: point plot - most frequent trip duration
sb.set_style('white')
plt.subplot(1, 2, 2)
unknown_mode = temp_df.query(' bike_type == "unknown" ').duration_min.mode()[0]
standard_mode = temp_df.query(' bike_type == "Standard" ').duration_min.mode()[0]
electric_mode = temp_df.query(' bike_type == "Electric" ').duration_min.mode()[0]
smart_mode = temp_df.query(' bike_type == "Smart" ').duration_min.mode()[0]
labels = temp_df.bike_type.sort_values(ascending=True).unique()
# Order the heights by the labels themselves so a value can never be plotted
# under the wrong bike type.
mode_by_type = {'unknown': unknown_mode, 'Standard': standard_mode,
                'Electric': electric_mode, 'Smart': smart_mode}
heights = [mode_by_type[bike_type] for bike_type in labels]
sb.pointplot(x = labels, y = heights, color = base_color)
plt.title('Most frequent trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.xlabel('\nBike type', fontsize = 14)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xticks(fontsize = 12)
# reuse the y range of the left plot so both panels are directly comparable
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 5)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine();

# add annotations
# -------------------------------------------------------
freq_rental_counts = heights
freq_rental_max = max(freq_rental_counts)
clrs = ['gold' if (count > ((freq_rental_max*4)/5)) else 'limegreen' for count in freq_rental_counts]
# get the current tick locations
locs, labels = plt.xticks()
# one annotation per tick, placed above the point
for loc, count, clr in zip(locs, freq_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    plt.text(loc-0.1, count + (freq_rental_max/3), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of rental durations under 30 min based on bike type\n', fontsize = 16, weight = 'bold');
# bbox_inches='tight' makes the saved image include all of the x and y labels
plt.savefig('plots/3.2.20.n Average trip durations under 30 minutes based on bike type.png', dpi=300, bbox_inches='tight')
When the dataset is limited to trips under 30 minutes (excluding 1 minute trips):
- If the `bike_type` is `unknown`, then the bike rentals have an average rental duration of 12 minutes and a mode of 6 minutes.
- If the `bike_type` is `Standard`, then the bike rentals have an average rental duration of 11 minutes and a mode of 5 minutes.
- If the `bike_type` is `Electric`, then the bike rentals have an average rental duration of 13 minutes and a mode of 4 minutes.
- If the `bike_type` is `Smart`, then the average rental duration increases to 16 minutes and the mode increases to 7 minutes.
Tabular data of the average trip durations based on the dataset limitation of duration entries:
Avg trip duration (min):

| Dataset used to measure | unknown | Standard | Electric | Smart |
|---|---|---|---|---|
| trips under 30 minutes | 12 | 11 | 13 | 16 |
| trips under 120 minutes | 19 | 17 | 16 | 31 |
| overall trips | 31 | 31 | 25 | 45 |
Tabular data of the most frequent trip durations based on the dataset limitation of duration entries:
Most freq trip duration (min):

| Dataset used to measure | unknown | Standard | Electric | Smart |
|---|---|---|---|---|
| trips under 30 minutes | 6 | 5 | 4 | 7 |
| trips under 120 minutes | 6 | 5 | 4 | 7 |
| overall trips | 6 | 5 | 4 | 7 |
# Summary table of the average and most frequent trip duration (minutes) per
# bike type, for each of the three dataset cut-offs. The figures are entered
# by hand, one row of values per cut-off.
_datasets = ['< 30', '< 120', 'overall']
_bike_types = ['unknown', 'Standard', 'Electric', 'Smart']
_averages = [
    [12, 11, 13, 16],   # '< 30'
    [19, 17, 16, 31],   # '< 120'
    [31, 31, 25, 45],   # 'overall'
]
_modes = [
    [6, 5, 4, 7],       # '< 30'
    [6, 5, 4, 7],       # '< 120'
    [6, 5, 4, 7],       # 'overall'
]

duration_df = pd.DataFrame({
    'dataset': [name for name in _datasets for _ in _bike_types],
    'bike_type': _bike_types * len(_datasets),
    'duration_avg': [value for row in _averages for value in row],
    'duration_mode': [value for row in _modes for value in row],
})
duration_df
# Grouped bar charts of duration_df: average (left) and most frequent (right)
# trip duration per bike type, one bar per dataset cut-off.
plt.figure(figsize=[12, 5])
sb.set_palette(palette="GnBu", n_colors=3, desat=None)
base_color = sb.color_palette()[2]


def _format_bike_type_panel(panel_title, y_text, y_limits=None):
    """Set the title, axis labels and y ticks (steps of 10) of the current axes."""
    plt.title(panel_title, weight='bold', fontsize=16, color='dimgrey')
    plt.ylabel(y_text, fontsize=14)
    plt.xlabel('\nBike type', fontsize=14)
    plt.xticks(fontsize=12)
    if y_limits is not None:
        plt.ylim(y_limits)
    tick_locs, _ = plt.yticks()
    tick_values = np.arange(0, int(math.ceil(max(tick_locs))) + 10, 10)
    plt.yticks(tick_values, tick_values, fontsize=12)


def _label_bars_and_separate(axes):
    """Write every bar's height above it and draw dashed lines between bike types."""
    for bar in axes.patches:
        axes.annotate(format(bar.get_height(), '.0f'),
                      (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                      ha='center', va='center', xytext=(0, 10),
                      textcoords='offset points', fontsize=12)
    for position in [0.5, 1.5, 2.5]:
        plt.axvline(position, ls='--', color='grey', linewidth=1, alpha=0.4)
    sb.despine(top=True, bottom=False, left=False, right=True)


# left panel: average trip duration (legend hidden, it is shown on the right)
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.barplot(data=duration_df, x='bike_type', y='duration_avg', hue='dataset')
_format_bike_type_panel('Avg. Trip durations\n', 'Avg. duration (minutes)\n')
plt.legend('', frameon=False, fancybox=False)
_label_bars_and_separate(ax1)

# right panel: most frequent trip duration, on the same y range as the left
sb.set_style('white')
plt.subplot(1, 2, 2)
g = sb.barplot(data=duration_df, x='bike_type', y='duration_mode', hue='dataset')
_format_bike_type_panel('Most frequent durations\n', 'Duration (minutes)\n',
                        y_limits=ax1.get_ylim())
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha=1,
           borderpad=1, borderaxespad=1, loc='upper right', labelspacing=0.5, ncol=1,
           title='Trip durations', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.15, 1))
_label_bars_and_separate(g)

plt.subplots_adjust(wspace=0.3, hspace=0.3, top=0.75)
plt.suptitle('Assessment of durations based on dataset over bike type\n', fontsize=16, weight='bold');
# bbox_inches='tight' makes the saved image include all axis labels
plt.savefig('plots/3.2.20.o Assessment of durations based on dataset over bike type.png', dpi=300, bbox_inches='tight')
# Point plots of duration_df: average (left) and most frequent (right) trip
# duration across the dataset cut-offs, one line per bike type.
plt.figure(figsize=[12, 5])
flatui = ['#ff91e2', '#91ffda', '#60acfc', '#bd91ff']
sb.set_palette(flatui, n_colors=4, desat=0.8)


def _format_dataset_point_panel(panel_title, y_text, y_limits=None):
    """Set the title, axis labels and y ticks (steps of 10) of the current axes."""
    plt.title(panel_title, weight='bold', fontsize=16, color='dimgrey')
    plt.ylabel(y_text, fontsize=14)
    plt.xlabel('\nTrip durations (minutes)', fontsize=14)
    plt.xticks(fontsize=12)
    if y_limits is not None:
        plt.ylim(y_limits)
    tick_locs, _ = plt.yticks()
    tick_values = np.arange(0, int(math.ceil(max(tick_locs))) + 10, 10)
    plt.yticks(tick_values, tick_values, fontsize=12)


# left panel: average trip duration (legend hidden, it is shown on the right)
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.pointplot(data=duration_df, x='dataset', y='duration_avg', hue='bike_type', alpha=1)
_format_dataset_point_panel('Avg. Trip durations\n', 'Avg. duration (minutes)\n')
plt.legend('', frameon=False, fancybox=False)
sb.despine(top=True, bottom=False, left=False, right=True)

# right panel: most frequent trip duration, on the same y range as the left
sb.set_style('white')
plt.subplot(1, 2, 2)
g = sb.pointplot(data=duration_df, x='dataset', y='duration_mode', hue='bike_type', alpha=1)
_format_dataset_point_panel('Most frequent durations\n', 'Duration (minutes)\n',
                            y_limits=ax1.get_ylim())
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha=1,
           borderpad=1, borderaxespad=1, loc='upper right', labelspacing=0.5, ncol=1,
           title='Bike type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5)
sb.despine(top=True, bottom=False, left=False, right=True)

plt.subplots_adjust(wspace=0.3, hspace=0.3, top=0.75)
plt.suptitle('Assessment of trip durations based on bike type over datasets\n', fontsize=16, weight='bold');
# bbox_inches='tight' makes the saved image include all axis labels
plt.savefig('plots/3.2.20.p Assessment of trip durations based on bike type over datasets.png', dpi=300, bbox_inches='tight')
The lines for each bike type are closely packed and hard to interpret. Hence, plot the distribution of trip durations based on bike type over the different datasets as a bar chart.
# Grouped bar charts of duration_df: average (left) and most frequent (right)
# trip duration per dataset cut-off, one bar per bike type.
plt.figure(figsize=[12, 6])
flatui = ['#ff91e2', '#91ffda', '#60acfc', '#bd91ff']
sb.set_palette(flatui, n_colors=4, desat=0.6)


def _format_dataset_bar_panel(axes, panel_title, y_text, y_limits=None):
    """Label the current axes, then annotate the bars of ``axes``.

    Sets title, axis labels and y ticks in steps of 10, writes every bar's
    height above it and draws dashed lines between the dataset groups.
    """
    plt.title(panel_title, weight='bold', fontsize=16, color='dimgrey')
    plt.ylabel(y_text, fontsize=14)
    plt.xlabel('\nTrip durations (minutes)', fontsize=14)
    plt.xticks(fontsize=12)
    if y_limits is not None:
        plt.ylim(y_limits)
    tick_locs, _ = plt.yticks()
    tick_values = np.arange(0, int(math.ceil(max(tick_locs))) + 10, 10)
    plt.yticks(tick_values, tick_values, fontsize=12)
    for bar in axes.patches:
        axes.annotate(format(bar.get_height(), '.0f'),
                      (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                      ha='center', va='center', xytext=(0, 10),
                      textcoords='offset points', fontsize=12)
    for position in [0.5, 1.5]:
        plt.axvline(position, ls='--', color='grey', linewidth=1, alpha=0.4)


# left panel: average trip duration (legend hidden, it is shown on the right)
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.barplot(data=duration_df, x='dataset', y='duration_avg', hue='bike_type', alpha=1)
_format_dataset_bar_panel(ax1, 'Avg. Trip durations\n', 'Avg. duration (minutes)\n')
plt.legend('', frameon=False, fancybox=False)
sb.despine(top=True, bottom=False, left=False, right=True)

# right panel: most frequent trip duration, on the same y range as the left
sb.set_style('white')
plt.subplot(1, 2, 2)
g = sb.barplot(data=duration_df, x='dataset', y='duration_mode', hue='bike_type', alpha=1)
_format_dataset_bar_panel(g, 'Most frequent trip durations\n', 'Duration (minutes)\n',
                          y_limits=ax1.get_ylim())
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha=1,
           borderpad=1, borderaxespad=1, loc='upper left', labelspacing=0.5, ncol=2,
           title='Bike type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(-0.5, 1.5))
sb.despine(top=True, bottom=False, left=False, right=True)

plt.subplots_adjust(wspace=0.3, hspace=0.3, top=0.65)
plt.suptitle('Assessment of trip durations based on bike type over datasets\n', fontsize=16, weight='bold');
# bbox_inches='tight' makes the saved image include all axis labels
plt.savefig('plots/3.2.20.q Assessment of trip durations based on bike type over datasets.png', dpi=300, bbox_inches='tight')
duration_min and pass_type columns:¶
Columns: duration_min, pass_type
Data type: (Numerical, continuous) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Bar chart
sb.set_style('white')
# Histogram of trip durations (100-minute bins) in one facet per pass type.
from matplotlib.ticker import FuncFormatter

sb.set_palette(palette = "colorblind", n_colors = 10, desat = 0.6)
base_color = sb.color_palette()[6]
g = sb.FacetGrid(data = bikeshare, col = 'pass_type', col_wrap = 3, height = 3, aspect = 1)
g.map(plt.hist, "duration_min", color = base_color, bins = np.arange(0, bikeshare.duration_min.max()+100, 100))
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Distribution of rental durations based on pass type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')

# Show the y ticks as multiples of a million. A formatter works from the tick
# values themselves, so it does not depend on the label text being rendered
# and parseable as an integer, and it cannot hand an axis more labels than it
# has ticks (the earlier approach collected the labels of every facet into a
# single list).
millions_formatter = FuncFormatter(lambda count, _pos: '{:0.1f} M'.format(count/1000000))
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(millions_formatter)
    # the x tick text stays as generated; only the font size changes
    ax.tick_params(axis = 'both', labelsize = 12)

g.set_xlabels('\nDuration (minutes)', size = 14)
g.set_ylabels('Rentals (millions)\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3);
# bbox_inches='tight' makes the saved image include all of the x and y labels
plt.savefig('plots/3.2.21.a Facet Grid of rental durations on pass type.png', dpi=300, bbox_inches='tight')
The distribution of duration_min values is vague and not easy to interpret. This can be overcome by limiting the plot to a threshold value.
Let us plot more variety of graphs to observe the distribution of data and hidden insights.
# Three views (violin, box and strip plot) of all trip durations, split by
# pass type, drawn side by side in one figure.
plt.figure(figsize=[16, 4])
sb.set_style('darkgrid')
sb.set_palette(palette="colorblind", n_colors=10, desat=0.6)
base_color = sb.color_palette()[6]

# One entry per panel, left to right: plotting function, extra keyword
# arguments for it, panel title and y-axis label.
panels = [
    (sb.violinplot, {'inner': 'quartile'}, 'Trip durations - Violin plot\n', 'Duration (minutes)\n'),
    (sb.boxplot, {}, 'Trip durations - Box plot\n', ''),
    (sb.stripplot, {'alpha': 0.5}, 'Trip durations - Strip plot\n', ''),
]
for position, (draw, extra, panel_title, y_text) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    draw(data=bikeshare, x='pass_type', y='duration_min', color=base_color, **extra)
    plt.title(panel_title, weight='bold', fontsize=16)
    plt.xlabel('\nPass type', fontsize=14)
    plt.ylabel(y_text, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

plt.subplots_adjust(wspace=0.3, hspace=0.3);
# bbox_inches='tight' makes the saved image include all axis labels
plt.savefig('plots/3.2.21.b Distribution of Pass type durations on various plots.png', dpi=300, bbox_inches='tight')
The above plot depicts the presence of a long tail of outliers, which requires closer observation for a better understanding of the data distribution.
Calculate the average trip duration and the most frequent trip duration for each pass type.
def _pass_type_duration_stats(pass_type):
    """Return (mean rounded up, most frequent value) of ``duration_min``.

    Only the rows of ``bikeshare`` whose ``pass_type`` equals the given
    value are considered.
    """
    durations = bikeshare.loc[bikeshare['pass_type'] == pass_type, 'duration_min']
    return math.ceil(durations.mean()), durations.mode()[0]


walkup_mean, walkup_mode = _pass_type_duration_stats('Walk-up')
oneday_mean, oneday_mode = _pass_type_duration_stats('One Day')
monthly_mean, monthly_mode = _pass_type_duration_stats('Monthly')
flex_mean, flex_mode = _pass_type_duration_stats('Flex')
annual_mean, annual_mode = _pass_type_duration_stats('Annual')

# report the means first, then the modes, one line per pass type
print('Duration mean'.center(30, '-'))
for caption, value in (('walkup_mean : ', walkup_mean),
                       ('oneday_mean : ', oneday_mean),
                       ('monthly_mean : ', monthly_mean),
                       ('flex_mean : ', flex_mean),
                       ('annual_mean : ', annual_mean)):
    print(caption, value, 'minutes')
print('\n')
print('Duration mode'.center(30, '-'))
for caption, value in (('walkup_mode : ', walkup_mode),
                       ('oneday_mode : ', oneday_mode),
                       ('monthly_mode : ', monthly_mode),
                       ('flex_mode : ', flex_mode),
                       ('annual_mode : ', annual_mode)):
    print(caption, value, 'minutes')
The calculations are influenced by the presence of outliers.
# Side-by-side point plots of the average (left) and the most frequent (right)
# trip duration per pass type over all trips. Every point is annotated with
# its value rounded up to a whole minute.
plt.figure(figsize = [12, 5])
sb.set_palette(palette = "colorblind", n_colors = 10, desat = 0.6)
base_color = sb.color_palette()[6]

# left plot: point plot - Avg trip duration
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.pointplot(data = bikeshare, x = "pass_type", y = "duration_min", linestyles = "-", color = base_color)
plt.title('Avg. Trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize = 14)
plt.xlabel('\nPass type', fontsize = 14)
plt.xticks(fontsize = 12)
# rebuild the y ticks in steps of 10, starting at 0
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);

# add annotations
# -------------------------------------------------------
cat_order = bikeshare.pass_type.sort_values(ascending=True).unique()
# Aggregate only the duration column: averaging the whole frame would also try
# to average its non-numeric columns, which pandas >= 2.0 rejects.
avg_rental_counts = bikeshare.groupby('pass_type', observed = True)['duration_min'].mean()[cat_order]
avg_rental_max = avg_rental_counts.max()
# values above 80% of the largest average are highlighted in gold
clrs = ['gold' if (count > ((avg_rental_max*4)/5)) else 'limegreen' for count in avg_rental_counts]
# get the current tick locations
locs, labels = plt.xticks()
# one annotation per tick, placed above and to the right of the point
for loc, count, clr in zip(locs, avg_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    plt.text(loc+0.2, count + int(avg_rental_max/10), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

# right plot: point plot - most frequent trip duration
sb.set_style('white')
plt.subplot(1, 2, 2)
walkup_mode = bikeshare.query(' pass_type == "Walk-up" ').duration_min.mode()[0]
oneday_mode = bikeshare.query(' pass_type == "One Day" ').duration_min.mode()[0]
monthly_mode = bikeshare.query(' pass_type == "Monthly" ').duration_min.mode()[0]
flex_mode = bikeshare.query(' pass_type == "Flex" ').duration_min.mode()[0]
annual_mode = bikeshare.query(' pass_type == "Annual" ').duration_min.mode()[0]
labels = bikeshare.pass_type.sort_values(ascending=True).unique()
# Order the heights by the labels themselves so a value can never be plotted
# under the wrong pass type, whatever the sort order of pass_type is.
mode_by_type = {'Walk-up': walkup_mode, 'One Day': oneday_mode, 'Monthly': monthly_mode,
                'Flex': flex_mode, 'Annual': annual_mode}
heights = [mode_by_type[pass_type] for pass_type in labels]
sb.pointplot(x = labels, y = heights, color = base_color)
plt.title('Most frequent trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.xlabel('\nPass type', fontsize = 14)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xticks(fontsize = 12)
# reuse the y range of the left plot so both panels are directly comparable
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine();

# add annotations
# -------------------------------------------------------
freq_rental_counts = heights
freq_rental_max = max(freq_rental_counts)
clrs = ['gold' if (count > ((freq_rental_max*4)/5)) else 'limegreen' for count in freq_rental_counts]
# get the current tick locations
locs, labels = plt.xticks()
# one annotation per tick, placed above the point
for loc, count, clr in zip(locs, freq_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    plt.text(loc, count + (freq_rental_max/2), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of all rental durations based on pass type\n', fontsize = 16, weight = 'bold');
# bbox_inches='tight' makes the saved image include all of the x and y labels
plt.savefig('plots/3.2.21.c Average trip durations based on pass type.png', dpi=300, bbox_inches='tight')
- If the `pass_type` is `Walk-up`, then the bike rentals have an average rental duration of 51 minutes and a mode of 10 minutes.
- If the `pass_type` is `One Day`, then the bike rentals have an average rental duration of 61 minutes and a mode of 8 minutes.
- If the `pass_type` is `Monthly`, then the bike rentals have an average rental duration of 15 minutes and a mode of 5 minutes.
- If the `pass_type` is `Flex`, then the bike rentals have an average rental duration of 17 minutes and a mode of 3 minutes.
- If the `pass_type` is `Annual`, then the bike rentals have an average rental duration of 24 minutes and a mode of 5 minutes.
Trip durations of 1 minute or less are probably caused by riders returning the bicycle immediately after renting it because of a technical or other issue. Hence, exclude the trips that last 1 minute or less and re-evaluate the statistics.
# Remove the trips that lasted one minute or less from the full dataset.
short_trips = bikeshare[bikeshare['duration_min'] <= 1]
drop_index = short_trips.index
temp_df = bikeshare.drop(index=drop_index)
plt.figure(figsize = [12, 5])
sb.set_palette(palette = "colorblind", n_colors = 10, desat = 0.6)
base_color = sb.color_palette()[6]
# left plot: point plot - Avg trip duration
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.pointplot(data = temp_df, x = "pass_type", y = "duration_min", linestyles = "-", color = base_color)
plt.title('Avg. Trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize = 14)
plt.xlabel('\nPass type', fontsize = 14)
plt.xticks(fontsize = 12)
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);
# add annotations
# -------------------------------------------------------
cat_order = temp_df.pass_type.sort_values(ascending=True).unique()
avg_rental_counts = temp_df.groupby([temp_df["pass_type"]]).mean().duration_min[cat_order]
avg_rental_max = avg_rental_counts.max()
clrs = ['gold' if (count > ((avg_rental_max*4)/5)) else 'limegreen' for count in avg_rental_counts ]
# get the current tick locations and labels
locs, labels = plt.xticks()
# loop through each pair of locations and labels
for loc, label, avg_rental_count, clr in zip(locs, labels, avg_rental_counts, clrs):
try:
count = avg_rental_count
except KeyError:
count = 0
pct_string = '{:0.0f} min'.format(math.ceil(count))
# print the annotation depending on the bar length
plt.text(loc+0.2, count + int(avg_rental_max/10), pct_string, ha = 'center', color = 'black',
fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------
# right plot: Bar chart - most frequent trip duration
sb.set_style('white')
plt.subplot(1, 2, 2)
walkup_mode = temp_df.query(' pass_type == "Walk-up" ').duration_min.mode()[0]
oneday_mode = temp_df.query(' pass_type == "One Day" ').duration_min.mode()[0]
monthly_mode = temp_df.query(' pass_type == "Monthly" ').duration_min.mode()[0]
flex_mode = temp_df.query(' pass_type == "Flex" ').duration_min.mode()[0]
annual_mode = temp_df.query(' pass_type == "Annual" ').duration_min.mode()[0]
heights = [walkup_mode, oneday_mode, monthly_mode, flex_mode, annual_mode]
labels = temp_df.pass_type.sort_values(ascending=True).unique()
sb.pointplot(x = labels, y = heights, color = base_color)
plt.title('Most frequent trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.xlabel('\nPass type', fontsize = 14)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xticks(fontsize = 12)
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine();
# add annotations
# -------------------------------------------------------
freq_rental_counts = heights
freq_rental_max = max(freq_rental_counts)
clrs = ['gold' if (count > ((freq_rental_max*4)/5)) else 'limegreen' for count in freq_rental_counts ]
# get the current tick locations and labels
locs, labels = plt.xticks()
# loop through each pair of locations and labels
for loc, label, freq_rental_count, clr in zip(locs, labels, freq_rental_counts, clrs):
try:
count = freq_rental_count
except KeyError:
count = 0
pct_string = '{:0.0f} min'.format(math.ceil(count))
# print the annotation depending on the bar length
plt.text(loc-0.1, count + (freq_rental_max/2), pct_string, ha = 'center', color = 'black',
fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------
plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of all rental durations based on pass type\n', fontsize = 16, weight = 'bold');
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.21.d Average trip durations based on pass type.png', dpi=300, bbox_inches='tight')
When the trips with durations <= 1 minute are removed from the assessment, then the result is as follows:
- If the `pass_type` is `Walk-up`, then the average rental duration increases to `52 minutes`, with a mode of `10 minutes`.
- If the `pass_type` is `One Day`, then the bike rentals have an average rental duration of `61 minutes` and a mode of `8 minutes`.
- If the `pass_type` is `Monthly`, then the bike rentals have an average rental duration of `15 minutes` and a mode of `5 minutes`.
- If the `pass_type` is `Flex`, then the bike rentals have an average rental duration of `17 minutes` and a mode of `3 minutes`.
- If the `pass_type` is `Annual`, then the average rental duration increases to `25 minutes`, with a mode of `5 minutes`.
Dataset limited under 120 min:
# summary statistics of the trip durations in the full dataset
bikeshare.duration_min.describe()
The distribution of `duration_min` values is vague and not easy to interpret. This can be overcome by limiting the dataset to a threshold value, say `120 minutes` or `2 hours`, as most of the `duration_min` values fall under this boundary.
# share (in percent, rounded to 2 decimals) of all trips lasting at most 2 hours
trips_within_2h = bikeshare.query(' duration_min <= 120 ').shape[0]
np.round((trips_within_2h/bikeshare.shape[0])*100, 2)
# trips longer than 2 hours, i.e. the rows a 120-minute limit would remove
duration_120 = bikeshare.query(' duration_min > 120 ')


def _dropped_share(pass_name):
    """Return the percentage (2 decimals) of `pass_name` trips that exceed 120 minutes."""
    dropped = len(duration_120[duration_120['pass_type'] == pass_name])
    total = len(bikeshare[bikeshare['pass_type'] == pass_name])
    return np.round((dropped/total)*100, 2)


# proportion of trips per pass type that will be dropped
walkup_type_drops = _dropped_share('Walk-up')
oneday_type_drops = _dropped_share('One Day')
monthly_type_drops = _dropped_share('Monthly')
flex_type_drops = _dropped_share('Flex')
annual_type_drops = _dropped_share('Annual')

# one aligned report line per pass type
for message, share in (
    ("Proportion of walkup type entries that will be dropped", walkup_type_drops),
    ("Proportion of oneday type entries that will be dropped", oneday_type_drops),
    ("Proportion of monthly type entries that will be dropped", monthly_type_drops),
    ("Proportion of flex type entries that will be dropped", flex_type_drops),
    ("Proportion of annual type entries that will be dropped", annual_type_drops),
):
    print(message.ljust(60, ' '), ':', share)
# Keep only the trips that last at most 2 hours
duration_lim_120 = bikeshare.loc[bikeshare['duration_min'] <= 120]

sb.set_style('white')
sb.set_palette(palette = "colorblind", n_colors = 10, desat = 0.6)
base_color = sb.color_palette()[6]

# histogram with 5-minute wide bins
longest_trip = duration_lim_120.duration_min.max()
bin_edges = np.arange(0, longest_trip+10, 5)
plt.hist(duration_lim_120['duration_min'], color = base_color, bins = bin_edges)
plt.title('Distribution of trip durations under 2 hours\n', weight = 'bold', fontsize = 16)
plt.xlabel('\nDuration (minutes)', fontsize = 14)
plt.ylabel('Rentals (thousands)\n', fontsize = 14)
plt.xticks(fontsize = 12)

# relabel the y axis in thousands, one tick every 50,000 rentals
locs, labels = plt.yticks()
top_tick = int(math.ceil(max(locs)))
y_tick_locs = np.arange(0, top_tick+1000, 50000)
as_thousands = '{:0.0f} K'.format
y_tick_names = [as_thousands(tick/1000) for tick in y_tick_locs]
plt.yticks(y_tick_locs, y_tick_names, fontsize=12)
sb.despine()

# bbox_inches='tight' makes the saved figure include all of the x and y labels
plt.savefig('plots/3.2.21.e Distribution of trip durations under 2 hours.png', dpi=300, bbox_inches='tight')
# Histogram of trip durations (<= 120 min) in one facet per pass type,
# with the y axis shown in thousands of rentals. Saves the figure to disk.
from matplotlib.ticker import FuncFormatter

sb.set_style('white')
sb.set_palette(palette = "colorblind", n_colors = 10, desat = 0.6)
base_color = sb.color_palette()[6]
g = sb.FacetGrid(data = duration_lim_120, col = 'pass_type', col_wrap = 3, height = 3, aspect = 1, xlim=(0,120))
g.set(xmargin=0.5, ymargin=0.5)
g.map(plt.hist, "duration_min", color = base_color, bins = np.arange(0, duration_lim_120.duration_min.max()+10, 5))
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Distribution of rental durations under 120 min based on pass type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')

# Format the ticks per axis instead of collecting the label texts of every
# facet into one list: that concatenated list is longer than the tick count
# of any single axis, and parsing the label text only works once it has been
# populated. A formatter derives each label from the tick value itself.
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(FuncFormatter(lambda value, _pos: '{:0.0f} K'.format(value/1000)))
    ax.tick_params(axis = 'both', labelsize = 12)

g.set_xlabels('\nDuration (minutes)', size = 14)
g.set_ylabels('Rentals (thousands)\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3)
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.21.f Facet Grid of rental durations under 2 hours on pass type.png', dpi=300, bbox_inches='tight')
Let us plot more variety of graphs to observe the distribution of data and hidden insights.
# Three side-by-side views (violin, box, strip) of trip duration per pass type
# for trips of at most 120 minutes.
plt.figure(figsize = [16, 5])
sb.set_style('darkgrid')
sb.set_palette(palette = "colorblind", n_colors = 10, desat = 0.6)
base_color = sb.color_palette()[6]

# (plot function, plot-specific options, title, x label, y label), left to right
panels = [
    (sb.violinplot, {'inner': 'quartile'}, 'Trip durations - Violin plot\n', '\nPass type', 'Duration (minutes)\n'),
    (sb.boxplot, {}, 'Trip durations - Box plot\n', '\nPass Type', ''),
    (sb.stripplot, {'alpha': 0.002}, 'Trip durations - Strip plot\n', '\nPass Type', ''),
]
for position, (draw, options, title, x_label, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    draw(data = duration_lim_120, x = 'pass_type', y = 'duration_min', color = base_color, **options)
    plt.title(title, weight = 'bold', fontsize = 16, color = 'dimgrey')
    plt.xlabel(x_label, fontsize = 14)
    plt.ylabel(y_label, fontsize = 14)
    plt.xticks(fontsize = 12)
    plt.yticks(fontsize = 12)

plt.subplots_adjust(wspace=0.2, hspace=0.3)
plt.subplots_adjust(top=0.75)
plt.suptitle('Distribution of rental durations under 120 min based on pass type\n', fontsize = 16, weight = 'bold')
# bbox_inches='tight' makes the saved figure include all of the x and y labels
plt.savefig('plots/3.2.21.g Distribution of pass type durations under 2 hours on various plots.png', dpi=300, bbox_inches='tight')
The above plot depicts the presence of a long tail of outliers, which requires even closer observation for a better understanding of the data distribution.
Calculate the average trip duration and the most frequent trip duration subjected to each pass type.
def _duration_stats(frame, pass_name):
    """Return (mean rounded up, first mode) of `duration_min` for one pass type in `frame`."""
    durations = frame.loc[frame['pass_type'] == pass_name, 'duration_min']
    return math.ceil(durations.mean()), durations.mode()[0]


# mean and mode of the trip duration per pass type, trips of at most 120 minutes
walkup_mean, walkup_mode = _duration_stats(duration_lim_120, 'Walk-up')
oneday_mean, oneday_mode = _duration_stats(duration_lim_120, 'One Day')
monthly_mean, monthly_mode = _duration_stats(duration_lim_120, 'Monthly')
flex_mean, flex_mode = _duration_stats(duration_lim_120, 'Flex')
annual_mean, annual_mode = _duration_stats(duration_lim_120, 'Annual')

print('Duration mean'.center(30,'-'))
for caption, minutes in (
    ('walkup_mean : ', walkup_mean),
    ('oneday_mean : ', oneday_mean),
    ('monthly_mean : ', monthly_mean),
    ('flex_mean : ', flex_mean),
    ('annual_mean : ', annual_mean),
):
    print(caption, minutes, 'minutes')
print('\n')
print('Duration mode'.center(30,'-'))
for caption, minutes in (
    ('walkup_mode : ', walkup_mode),
    ('oneday_mode : ', oneday_mode),
    ('monthly_mode : ', monthly_mode),
    ('flex_mode : ', flex_mode),
    ('annual_mode : ', annual_mode),
):
    print(caption, minutes, 'minutes')
# Average and most frequent trip duration per pass type for trips of at most
# 120 minutes. Produces a two-panel figure (left: mean with annotations,
# right: mode with annotations) and saves it.
plt.figure(figsize = [12, 5])
sb.set_palette(palette = "colorblind", n_colors = 10, desat = 0.6)
base_color = sb.color_palette()[6]

# left plot: point plot - Avg trip duration
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.pointplot(data = duration_lim_120, x = "pass_type", y = "duration_min", linestyles = "-", color = base_color)
plt.title('Avg. Trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize = 14)
plt.xlabel('\nPass type', fontsize = 14)
plt.xticks(fontsize = 12)
# rebuild the y ticks in steps of 10 minutes, starting at 0
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True)

# add annotations
# -------------------------------------------------------
cat_order = duration_lim_120.pass_type.sort_values(ascending=True).unique()
# Aggregate only the duration column: a frame-wide .mean() would also try to
# average the non-numeric columns, which raises a TypeError on pandas >= 2.0.
avg_rental_counts = duration_lim_120.groupby("pass_type")["duration_min"].mean()[cat_order]
avg_rental_max = avg_rental_counts.max()
# highlight values within the top fifth of the maximum in gold
clrs = ['gold' if (count > ((avg_rental_max*4)/5)) else 'limegreen' for count in avg_rental_counts]
# get the current tick locations (one per pass type)
locs, labels = plt.xticks()
for loc, count, clr in zip(locs, avg_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    # place the annotation slightly above and to the right of the point
    plt.text(loc+0.2, count + int(avg_rental_max/10), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

# right plot: point plot - most frequent trip duration
sb.set_style('white')
plt.subplot(1, 2, 2)
walkup_mode = duration_lim_120.query(' pass_type == "Walk-up" ').duration_min.mode()[0]
oneday_mode = duration_lim_120.query(' pass_type == "One Day" ').duration_min.mode()[0]
monthly_mode = duration_lim_120.query(' pass_type == "Monthly" ').duration_min.mode()[0]
flex_mode = duration_lim_120.query(' pass_type == "Flex" ').duration_min.mode()[0]
annual_mode = duration_lim_120.query(' pass_type == "Annual" ').duration_min.mode()[0]
mode_by_pass = {'Walk-up': walkup_mode, 'One Day': oneday_mode, 'Monthly': monthly_mode,
                'Flex': flex_mode, 'Annual': annual_mode}
labels = duration_lim_120.pass_type.sort_values(ascending=True).unique()
# Look each mode up by its label so the heights always follow the label order,
# instead of relying on a hand-written list happening to match the sort order.
heights = [mode_by_pass[pass_name] for pass_name in labels]
sb.pointplot(x = labels, y = heights, color = base_color)
plt.title('Most frequent trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.xlabel('\nPass type', fontsize = 14)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xticks(fontsize = 12)
# share the y range with the left plot so both panels are directly comparable
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine()

# add annotations
# -------------------------------------------------------
freq_rental_counts = heights
freq_rental_max = max(freq_rental_counts)
clrs = ['gold' if (count > ((freq_rental_max*4)/5)) else 'limegreen' for count in freq_rental_counts]
# get the current tick locations (one per pass type)
locs, labels = plt.xticks()
for loc, count, clr in zip(locs, freq_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    # place the annotation above the point
    plt.text(loc, count + (freq_rental_max/3), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of rental durations under 120 min based on pass type\n', fontsize = 16, weight = 'bold')
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.21.h Average trip durations under 2 hours based on pass type.png', dpi=300, bbox_inches='tight')
When the dataset is limited to trips under 120 minutes:
- If the `pass_type` is `Walk-up`, then the bike rentals have an average rental duration of `32 minutes` and a mode of `10 minutes`.
- If the `pass_type` is `One Day`, then the bike rentals have an average rental duration of `31 minutes` and a mode of `8 minutes`.
- If the `pass_type` is `Monthly`, then the bike rentals have an average rental duration of `12 minutes` and a mode of `5 minutes`.
- If the `pass_type` is `Flex`, then the bike rentals have an average rental duration of `12 minutes` and a mode of `3 minutes`.
- If the `pass_type` is `Annual`, then the bike rentals have an average rental duration of `12 minutes` and a mode of `5 minutes`.
Trip durations of 1 minute or less are probably caused by the bicycle being returned immediately after rental due to a technical or other issue. Hence, exclude the trips of 1 minute or less and re-evaluate the statistics.
# Average and most frequent trip duration per pass type for trips of at most
# 120 minutes, after removing trips of 1 minute or less. Produces a two-panel
# figure (left: mean with annotations, right: mode with annotations) and saves it.
drop_index = duration_lim_120.query(' duration_min <= 1 ').index
temp_df = duration_lim_120.drop(drop_index)

plt.figure(figsize = [12, 5])
sb.set_palette(palette = "colorblind", n_colors = 10, desat = 0.6)
base_color = sb.color_palette()[6]

# left plot: point plot - Avg trip duration
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.pointplot(data = temp_df, x = "pass_type", y = "duration_min", linestyles = "-", color = base_color)
plt.title('Avg. Trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize = 14)
plt.xlabel('\nPass type', fontsize = 14)
plt.xticks(fontsize = 12)
# rebuild the y ticks in steps of 10 minutes, starting at 0
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True)

# add annotations
# -------------------------------------------------------
cat_order = temp_df.pass_type.sort_values(ascending=True).unique()
# Aggregate only the duration column: a frame-wide .mean() would also try to
# average the non-numeric columns, which raises a TypeError on pandas >= 2.0.
avg_rental_counts = temp_df.groupby("pass_type")["duration_min"].mean()[cat_order]
avg_rental_max = avg_rental_counts.max()
# highlight values within the top fifth of the maximum in gold
clrs = ['gold' if (count > ((avg_rental_max*4)/5)) else 'limegreen' for count in avg_rental_counts]
# get the current tick locations (one per pass type)
locs, labels = plt.xticks()
for loc, count, clr in zip(locs, avg_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    # place the annotation slightly above and to the right of the point
    plt.text(loc+0.2, count + int(avg_rental_max/10), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

# right plot: point plot - most frequent trip duration
sb.set_style('white')
plt.subplot(1, 2, 2)
walkup_mode = temp_df.query(' pass_type == "Walk-up" ').duration_min.mode()[0]
oneday_mode = temp_df.query(' pass_type == "One Day" ').duration_min.mode()[0]
monthly_mode = temp_df.query(' pass_type == "Monthly" ').duration_min.mode()[0]
flex_mode = temp_df.query(' pass_type == "Flex" ').duration_min.mode()[0]
annual_mode = temp_df.query(' pass_type == "Annual" ').duration_min.mode()[0]
mode_by_pass = {'Walk-up': walkup_mode, 'One Day': oneday_mode, 'Monthly': monthly_mode,
                'Flex': flex_mode, 'Annual': annual_mode}
labels = temp_df.pass_type.sort_values(ascending=True).unique()
# Look each mode up by its label so the heights always follow the label order,
# instead of relying on a hand-written list happening to match the sort order.
heights = [mode_by_pass[pass_name] for pass_name in labels]
sb.pointplot(x = labels, y = heights, color = base_color)
plt.title('Most frequent trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.xlabel('\nPass type', fontsize = 14)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xticks(fontsize = 12)
# share the y range with the left plot so both panels are directly comparable
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine()

# add annotations
# -------------------------------------------------------
freq_rental_counts = heights
freq_rental_max = max(freq_rental_counts)
clrs = ['gold' if (count > ((freq_rental_max*4)/5)) else 'limegreen' for count in freq_rental_counts]
# get the current tick locations (one per pass type)
locs, labels = plt.xticks()
for loc, count, clr in zip(locs, freq_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    # place the annotation above the point
    plt.text(loc-0.1, count + (freq_rental_max/3), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of rental durations under 120 min based on pass type\n', fontsize = 16, weight = 'bold')
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.21.i Average trip durations under 2 hours based on pass type.png', dpi=300, bbox_inches='tight')
When the dataset is limited to trips under 120 minutes (excluding 1 minute trips):
- If the `pass_type` is `Walk-up`, then the bike rentals have an average rental duration of `32 minutes` and a mode of `10 minutes`.
- If the `pass_type` is `One Day`, then the bike rentals have an average rental duration of `31 minutes` and a mode of `8 minutes`.
- If the `pass_type` is `Monthly`, then the bike rentals have an average rental duration of `12 minutes` and a mode of `5 minutes`.
- If the `pass_type` is `Flex`, then the bike rentals have an average rental duration of `12 minutes` and a mode of `3 minutes`.
- If the `pass_type` is `Annual`, then the average rental duration increases to `13 minutes`, with a mode of `5 minutes`.
Dataset limited under 30 min:
# summary statistics of the trip durations in the full dataset
bikeshare.duration_min.describe()
The calculations are influenced by the presence of outliers. This can be overcome by limiting the dataset to a threshold value, say `30 minutes`, as more than `75%` of the `duration_min` values fall under this boundary.
# Keep only the trips that last at most 30 minutes
duration_lim_30 = bikeshare.loc[bikeshare['duration_min'] <= 30]

sb.set_style('white')
sb.set_palette(palette = "colorblind", n_colors = 10, desat = 0.6)
base_color = sb.color_palette()[6]

# histogram with 1-minute wide bins
longest_trip = duration_lim_30.duration_min.max()
bin_edges = np.arange(0, longest_trip+2, 1)
plt.hist(duration_lim_30['duration_min'], color = base_color, bins = bin_edges)
plt.title('Distribution of trip durations under 30 minutes\n', weight = 'bold', fontsize = 16)
plt.xlabel('\nDuration (minutes)', fontsize = 14)
plt.ylabel('Rentals (thousands)\n', fontsize = 14)
plt.xticks(fontsize = 12)

# relabel the y axis in thousands, one tick every 10,000 rentals
locs, labels = plt.yticks()
top_tick = int(math.ceil(max(locs)))
y_tick_locs = np.arange(0, top_tick+1000, 10000)
as_thousands = '{:0.0f} K'.format
y_tick_names = [as_thousands(tick/1000) for tick in y_tick_locs]
plt.yticks(y_tick_locs, y_tick_names, fontsize=12)
sb.despine()

# bbox_inches='tight' makes the saved figure include all of the x and y labels
plt.savefig('plots/3.2.21.j Average trip durations under 30 minutes.png', dpi=300, bbox_inches='tight')
# Histogram of trip durations (<= 30 min) in one facet per pass type,
# with the y axis shown in thousands of rentals. Saves the figure to disk.
from matplotlib.ticker import FuncFormatter

sb.set_style('white')
sb.set_palette(palette = "colorblind", n_colors = 10, desat = 0.6)
base_color = sb.color_palette()[6]
g = sb.FacetGrid(data = duration_lim_30, col = 'pass_type', col_wrap = 3, height = 3, aspect = 1, xlim=(0,30))
g.set(xmargin=0.5, ymargin=0.5)
g.map(plt.hist, "duration_min", color = base_color, bins = np.arange(0, duration_lim_30.duration_min.max()+2, 1))
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Distribution of rental durations under 30 min based on pass type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')

# Format the ticks per axis instead of collecting the label texts of every
# facet into one list: that concatenated list is longer than the tick count
# of any single axis, and parsing the label text only works once it has been
# populated. A formatter derives each label from the tick value itself.
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(FuncFormatter(lambda value, _pos: '{:0.0f} K'.format(value/1000)))
    ax.tick_params(axis = 'both', labelsize = 12)

g.set_xlabels('\nDuration (minutes)', size = 14)
g.set_ylabels('Rentals (thousands)\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3)
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.21.k Facet Grid of rental durations under 30 minutes on pass type.png', dpi=300, bbox_inches='tight')
Let us plot more variety of graphs to observe the distribution of data and hidden insights.
# Three side-by-side views (violin, box, strip) of trip duration per pass type
# for trips of at most 30 minutes.
plt.figure(figsize = [16, 5])
sb.set_style('darkgrid')
sb.set_palette(palette = "colorblind", n_colors = 10, desat = 0.6)
base_color = sb.color_palette()[6]

# (plot function, plot-specific options, title, x label, y label), left to right
panels = [
    (sb.violinplot, {'inner': 'quartile'}, 'Trip durations - Violin plot\n', '\nPass type', 'Duration (minutes)\n'),
    (sb.boxplot, {}, 'Trip durations - Box plot\n', '\nPass Type', ''),
    (sb.stripplot, {'alpha': 0.002}, 'Trip durations - Strip plot\n', '\nPass Type', ''),
]
for position, (draw, options, title, x_label, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    draw(data = duration_lim_30, x = 'pass_type', y = 'duration_min', color = base_color, **options)
    plt.title(title, weight = 'bold', fontsize = 16, color = 'dimgrey')
    plt.xlabel(x_label, fontsize = 14)
    plt.ylabel(y_label, fontsize = 14)
    plt.xticks(fontsize = 12)
    plt.yticks(fontsize = 12)

plt.subplots_adjust(wspace=0.2, hspace=0.3)
plt.subplots_adjust(top=0.75)
plt.suptitle('Distribution of rental durations under 30 min based on pass type\n', fontsize = 16, weight = 'bold')
# bbox_inches='tight' makes the saved figure include all of the x and y labels
plt.savefig('plots/3.2.21.l Distribution of pass type durations under 30 minutes on various plots.png', dpi=300, bbox_inches='tight')
Calculate the average trip duration and the most frequent trip duration subjected to each pass type.
def _duration_stats(frame, pass_name):
    """Return (mean rounded up, first mode) of `duration_min` for one pass type in `frame`."""
    durations = frame.loc[frame['pass_type'] == pass_name, 'duration_min']
    return math.ceil(durations.mean()), durations.mode()[0]


# mean and mode of the trip duration per pass type, trips of at most 30 minutes
walkup_mean, walkup_mode = _duration_stats(duration_lim_30, 'Walk-up')
oneday_mean, oneday_mode = _duration_stats(duration_lim_30, 'One Day')
monthly_mean, monthly_mode = _duration_stats(duration_lim_30, 'Monthly')
flex_mean, flex_mode = _duration_stats(duration_lim_30, 'Flex')
annual_mean, annual_mode = _duration_stats(duration_lim_30, 'Annual')

print('Duration mean'.center(30,'-'))
for caption, minutes in (
    ('walkup_mean : ', walkup_mean),
    ('oneday_mean : ', oneday_mean),
    ('monthly_mean : ', monthly_mean),
    ('flex_mean : ', flex_mean),
    ('annual_mean : ', annual_mean),
):
    print(caption, minutes, 'minutes')
print('\n')
print('Duration mode'.center(30,'-'))
for caption, minutes in (
    ('walkup_mode : ', walkup_mode),
    ('oneday_mode : ', oneday_mode),
    ('monthly_mode : ', monthly_mode),
    ('flex_mode : ', flex_mode),
    ('annual_mode : ', annual_mode),
):
    print(caption, minutes, 'minutes')
# Average and most frequent trip duration per pass type for trips of at most
# 30 minutes. Produces a two-panel figure (left: mean with annotations,
# right: mode with annotations) and saves it.
plt.figure(figsize = [12, 5])
sb.set_palette(palette = "colorblind", n_colors = 10, desat = 0.6)
base_color = sb.color_palette()[6]

# left plot: point plot - Avg trip duration
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.pointplot(data = duration_lim_30, x = "pass_type", y = "duration_min", linestyles = "-", color = base_color)
plt.title('Avg. Trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize = 14)
plt.xlabel('\nPass type', fontsize = 14)
plt.xticks(fontsize = 12)
# rebuild the y ticks in steps of 5 minutes, starting at 0
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 5)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True)

# add annotations
# -------------------------------------------------------
cat_order = duration_lim_30.pass_type.sort_values(ascending=True).unique()
# Aggregate only the duration column: a frame-wide .mean() would also try to
# average the non-numeric columns, which raises a TypeError on pandas >= 2.0.
avg_rental_counts = duration_lim_30.groupby("pass_type")["duration_min"].mean()[cat_order]
avg_rental_max = avg_rental_counts.max()
# highlight values within the top fifth of the maximum in gold
clrs = ['gold' if (count > ((avg_rental_max*4)/5)) else 'limegreen' for count in avg_rental_counts]
# get the current tick locations (one per pass type)
locs, labels = plt.xticks()
for loc, count, clr in zip(locs, avg_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    # place the annotation slightly above and to the right of the point
    plt.text(loc+0.2, count + int(avg_rental_max/7), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

# right plot: point plot - most frequent trip duration
sb.set_style('white')
plt.subplot(1, 2, 2)
walkup_mode = duration_lim_30.query(' pass_type == "Walk-up" ').duration_min.mode()[0]
oneday_mode = duration_lim_30.query(' pass_type == "One Day" ').duration_min.mode()[0]
monthly_mode = duration_lim_30.query(' pass_type == "Monthly" ').duration_min.mode()[0]
flex_mode = duration_lim_30.query(' pass_type == "Flex" ').duration_min.mode()[0]
annual_mode = duration_lim_30.query(' pass_type == "Annual" ').duration_min.mode()[0]
mode_by_pass = {'Walk-up': walkup_mode, 'One Day': oneday_mode, 'Monthly': monthly_mode,
                'Flex': flex_mode, 'Annual': annual_mode}
labels = duration_lim_30.pass_type.sort_values(ascending=True).unique()
# Look each mode up by its label so the heights always follow the label order,
# instead of relying on a hand-written list happening to match the sort order.
heights = [mode_by_pass[pass_name] for pass_name in labels]
sb.pointplot(x = labels, y = heights, color = base_color)
plt.title('Most frequent trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.xlabel('\nPass type', fontsize = 14)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xticks(fontsize = 12)
# share the y range with the left plot so both panels are directly comparable
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine()

# add annotations
# -------------------------------------------------------
freq_rental_counts = heights
freq_rental_max = max(freq_rental_counts)
clrs = ['gold' if (count > ((freq_rental_max*4)/5)) else 'limegreen' for count in freq_rental_counts]
# get the current tick locations (one per pass type)
locs, labels = plt.xticks()
for loc, count, clr in zip(locs, freq_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    # place the annotation above the point
    plt.text(loc, count + (freq_rental_max/4), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of rental durations under 30 min based on pass type\n', fontsize = 16, weight = 'bold')
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.21.m Average trip durations under 30 minutes based on pass type.png', dpi=300, bbox_inches='tight')
When the dataset is limited to trips under 30 minutes:
- If the `pass_type` is `Walk-up`, then the bike rentals have an average rental duration of `16 minutes` and a mode of `10 minutes`.
- If the `pass_type` is `One Day`, then the bike rentals have an average rental duration of `15 minutes` and a mode of `8 minutes`.
- If the `pass_type` is `Monthly`, then the bike rentals have an average rental duration of `10 minutes` and a mode of `5 minutes`.
- If the `pass_type` is `Flex`, then the bike rentals have an average rental duration of `10 minutes` and a mode of `3 minutes`.
- If the `pass_type` is `Annual`, then the bike rentals have an average rental duration of `10 minutes` and a mode of `5 minutes`.
Trips lasting 1 minute or less are probably bikes that were returned immediately after rental because of a technical or other issue. Hence, exclude the trips with a duration of 1 minute or less and re-evaluate the statistics.
# Drop trips of one minute or less from the under-30-minute dataset and plot,
# per pass type, the mean trip duration (left) and the most frequent trip
# duration (right), each point annotated with its value in minutes.
drop_index = duration_lim_30.query(' duration_min <= 1 ').index
temp_df = duration_lim_30.drop(drop_index)

plt.figure(figsize = [12, 5])
sb.set_palette(palette = "colorblind", n_colors = 10, desat = 0.6)
base_color = sb.color_palette()[6]

# left plot: point plot - Avg trip duration
# -------------------------------------------------------
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.pointplot(data = temp_df, x = "pass_type", y = "duration_min", linestyles = "-", color = base_color)
plt.title('Avg. Trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize = 14)
plt.xlabel('\nPass type', fontsize = 14)
plt.xticks(fontsize = 12)
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);

# add annotations
# -------------------------------------------------------
cat_order = temp_df.pass_type.sort_values(ascending=True).unique()
# Average only the duration column. Averaging the whole frame also tries to
# average the text/categorical columns, which pandas >= 2.0 rejects.
avg_rental_counts = temp_df.groupby("pass_type", observed=True)["duration_min"].mean().loc[cat_order]
avg_rental_max = avg_rental_counts.max()
# highlight values within 20% of the largest mean
clrs = ['gold' if (count > ((avg_rental_max*4)/5)) else 'limegreen' for count in avg_rental_counts]
# get the current tick locations and labels
locs, labels = plt.xticks()
for loc, count, clr in zip(locs, avg_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    # place the label slightly to the right of and above the point
    plt.text(loc+0.2, count + int(avg_rental_max/8), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))

# -------------------------------------------------------
# right plot: point plot - most frequent trip duration
sb.set_style('white')
plt.subplot(1, 2, 2)
walkup_mode = temp_df.query(' pass_type == "Walk-up" ').duration_min.mode()[0]
oneday_mode = temp_df.query(' pass_type == "One Day" ').duration_min.mode()[0]
monthly_mode = temp_df.query(' pass_type == "Monthly" ').duration_min.mode()[0]
flex_mode = temp_df.query(' pass_type == "Flex" ').duration_min.mode()[0]
annual_mode = temp_df.query(' pass_type == "Annual" ').duration_min.mode()[0]
mode_by_pass = {'Walk-up': walkup_mode, 'One Day': oneday_mode, 'Monthly': monthly_mode,
                'Flex': flex_mode, 'Annual': annual_mode}
labels = temp_df.pass_type.sort_values(ascending=True).unique()
# Look each mode up by its label so a point can never be drawn under the
# wrong pass type, whatever order the labels sort into.
heights = [mode_by_pass[label] for label in labels]
sb.pointplot(x = labels, y = heights, color = base_color)
plt.title('Most frequent trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.xlabel('\nPass type', fontsize = 14)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xticks(fontsize = 12)
# share the y-range of the left plot so both panels are directly comparable
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine();

# add annotations
# -------------------------------------------------------
freq_rental_counts = heights
freq_rental_max = max(freq_rental_counts)
clrs = ['gold' if (count > ((freq_rental_max*4)/5)) else 'limegreen' for count in freq_rental_counts]
# get the current tick locations and labels
locs, labels = plt.xticks()
for loc, count, clr in zip(locs, freq_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    plt.text(loc, count + (freq_rental_max/5), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))

# -------------------------------------------------------
plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of rental durations under 30 min based on pass type\n', fontsize = 16, weight = 'bold');
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.21.n Average trip durations under 30 minutes based on pass type.png', dpi=300, bbox_inches='tight')
When the dataset is limited to trips under 30 minutes (excluding trips of 1 minute or less):
- If the `pass_type` is `Walk-up`, then the average rental duration increases to `17 minutes`, with a mode of `10 minutes`.
- If the `pass_type` is `One Day`, then the average rental duration increases to `16 minutes`, with a mode of `8 minutes`.
- If the `pass_type` is `Monthly`, then the average rental duration increases to `11 minutes`, with a mode of `5 minutes`.
- If the `pass_type` is `Flex`, then the bike rentals have an average rental duration of `10 minutes` and a mode of `3 minutes`.
- If the `pass_type` is `Annual`, then the bike rentals have an average rental duration of `10 minutes` and a mode of `5 minutes`.
Tabular data of the average trip durations (in minutes) per pass type, based on the duration limit applied to the dataset:
| Dataset used to measure | Walk-up | One Day | Monthly | Flex | Annual |
|---|---|---|---|---|---|
| trips under 30 minutes | 17 | 16 | 11 | 10 | 10 |
| trips under 120 minutes | 32 | 31 | 12 | 12 | 13 |
| overall trips | 52 | 61 | 15 | 17 | 25 |
Tabular data of the most frequent trip durations (in minutes) per pass type, based on the duration limit applied to the dataset:
| Dataset used to measure | Walk-up | One Day | Monthly | Flex | Annual |
|---|---|---|---|---|---|
| trips under 30 minutes | 10 | 8 | 5 | 3 | 5 |
| trips under 120 minutes | 10 | 8 | 5 | 3 | 5 |
| overall trips | 10 | 8 | 5 | 3 | 5 |
# Summary table of the statistics measured above: one row per
# (dataset cut-off, pass type) pair with its mean and mode duration in minutes.
dataset_names = ['< 30', '< 120', 'overall']
pass_types = ['Walk-up', 'One Day', 'Monthly', 'Flex', 'Annual']

duration_df = pd.DataFrame({
    # each dataset name is repeated once per pass type
    'dataset': [name for name in dataset_names for _ in pass_types],
    # the pass types cycle once per dataset
    'pass_type': pass_types * len(dataset_names),
    'duration_avg': [
        17, 16, 11, 10, 10,   # trips under 30 minutes
        32, 31, 12, 12, 13,   # trips under 120 minutes
        52, 61, 15, 17, 25,   # all trips
    ],
    # the mode is identical for all three dataset cut-offs
    'duration_mode': [10, 8, 5, 3, 5] * len(dataset_names),
})
duration_df
# Grouped bar charts of the summary table: for every pass type, one bar per
# dataset cut-off, showing the mean (left) and the mode (right) trip duration.
def _label_bar_heights(ax):
    """Write each bar's height, rounded to a whole number, just above the bar."""
    for p in ax.patches:
        ax.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points', fontsize = 12)


def _separate_groups(ax, positions):
    """Draw faint dashed vertical lines at the given x positions of `ax`."""
    for loc in positions:
        ax.axvline(loc, ls='--', color='grey', linewidth=1, alpha=0.4)


plt.figure(figsize = [12, 5])
sb.set_palette(palette = "GnBu", n_colors = 3, desat = None)
base_color = sb.color_palette()[2]
# x positions half-way between neighbouring pass-type groups
separators = [0.5, 1.5, 2.5, 3.5]

# left plot: bar chart - Avg trip duration
# -------------------------------------------------------
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.barplot(data = duration_df, x = 'pass_type', y = 'duration_avg', hue = 'dataset')
plt.title('Avg. Trip durations\n', weight = 'bold', fontsize = 16, color ='dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize = 14)
plt.xlabel('\nPass type', fontsize = 14)
plt.xticks(fontsize = 12)
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
# an empty label list replaces the automatic legend with an empty, frameless one;
# the shared legend is drawn on the right plot only
plt.legend('', frameon=False, fancybox=False)
_label_bar_heights(ax1)
_separate_groups(ax1, separators)
sb.despine(top=True, bottom=False, left=False, right=True);

# -------------------------------------------------------
# right plot: bar chart - most frequent trip duration
# -------------------------------------------------------
sb.set_style('white')
plt.subplot(1, 2, 2)
g = sb.barplot(data = duration_df, x = 'pass_type', y = 'duration_mode', hue = 'dataset')
plt.title('Most frequent durations\n', weight = 'bold', fontsize = 16, color ='dimgrey')
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xlabel('\nPass type', fontsize = 14)
plt.xticks(fontsize = 12)
# share the y-range of the left plot so both panels are directly comparable
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha = 1,
           borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5, ncol = 1,
           title='Trip durations', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.2, 1))
_label_bar_heights(g)
_separate_groups(g, separators)
sb.despine(top=True, bottom=False, left=False, right=True);

# -------------------------------------------------------
plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.75)
# The bars are grouped by pass type, so the title says "pass type"
# (it previously read "bike type").
plt.suptitle('Assessment of trip durations based on dataset over pass type\n', fontsize = 16, weight = 'bold');
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
# NOTE(review): the file name still says "bike type"; it is kept unchanged in case the image is referenced elsewhere.
plt.savefig('plots/3.2.21.o Assessment of trip durations based on dataset over bike type.png', dpi=300, bbox_inches='tight')
# Point plots of the summary table: one line per pass type across the three
# dataset cut-offs, mean duration on the left and mode duration on the right.
plt.figure(figsize = [12, 5])
flatui = ["#26bda7", "#9b59b6", "#3498db", "#e74c3c", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.8)
panel_title_style = dict(weight = 'bold', fontsize = 16, color = 'dimgrey')

# ------------------ left panel: average trip duration ------------------
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.pointplot(data = duration_df, x = 'dataset', y = 'duration_avg', hue = 'pass_type', alpha = 0.8)
ax1.set_title('Avg. Trip durations\n', **panel_title_style)
ax1.set_ylabel('Avg. duration (minutes)\n', fontsize = 14)
ax1.set_xlabel('\nTrip durations (minutes)', fontsize = 14)
plt.xticks(fontsize = 12)
# re-tick the y axis in steps of 10 up to just past the largest automatic tick
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, 10 + int(math.ceil(max(locs))), 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
# an empty label list leaves this panel with an empty, frameless legend
ax1.legend('', frameon=False, fancybox=False)
sb.despine(top=True, bottom=False, left=False, right=True);

# --------------- right panel: most frequent trip duration ---------------
sb.set_style('white')
plt.subplot(1, 2, 2)
g = sb.pointplot(data = duration_df, x = 'dataset', y = 'duration_mode', hue = 'pass_type', alpha = 0.8)
g.set_title('Most frequent durations\n', **panel_title_style)
g.set_ylabel('Duration (minutes)\n', fontsize = 14)
g.set_xlabel('\nTrip durations (minutes)', fontsize = 14)
plt.xticks(fontsize = 12)
# reuse the left panel's y-range so the two panels share one scale
g.set_ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, 10 + int(math.ceil(max(locs))), 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
legend_style = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha = 1,
    borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5, ncol = 1,
    title='Pass type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5,
)
g.legend(**legend_style)
sb.despine(top=True, bottom=False, left=False, right=True);

# ------------------------- figure-level layout -------------------------
plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.75)
plt.suptitle('Assessment of trip durations based on pass type over datasets\n', fontsize = 16, weight = 'bold');
# bbox_inches='tight' grows the saved area so that all x and y labels are included
plt.savefig('plots/3.2.21.p Assessment of trip durations based on pass type over datasets.png', dpi=300, bbox_inches='tight')
The lines for the individual pass types are closely packed and hard to interpret. Hence, plot the distribution of trip durations based on pass type over the different datasets as a bar chart.
Observation: There is a clear influence of outliers on the average trip durations. This can be observed between Walk-up and One day average trip durations. As the dataset increased/included outliers the difference between them increased. Same can be observed between Monthly, Flex and Annual trip durations.
# Grouped bar charts of the summary table: for every dataset cut-off, one bar
# per pass type, mean duration on the left and mode duration on the right.
plt.figure(figsize = [12, 6])
flatui = ["#26bda7", "#9b59b6", "#3498db", "#e74c3c", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=None)
panel_title_style = dict(weight = 'bold', fontsize = 16, color = 'dimgrey')
bar_label_style = dict(ha = 'center', va = 'center', xytext = (0, 10),
                       textcoords = 'offset points', fontsize = 12)
separator_style = dict(ls='--', color='grey', linewidth=1, alpha=0.4)

# ------------------ left panel: average trip duration ------------------
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.barplot(data = duration_df, x = 'dataset', y = 'duration_avg', hue = 'pass_type', alpha = 0.8)
ax1.set_title('Avg. Trip durations\n', **panel_title_style)
ax1.set_ylabel('Avg. duration (minutes)\n', fontsize = 14)
ax1.set_xlabel('\nTrip durations (minutes)', fontsize = 14)
plt.xticks(fontsize = 12)
# re-tick the y axis in steps of 10 up to just past the largest automatic tick
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, 10 + int(math.ceil(max(locs))), 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
# write every bar's rounded height above the bar
for p in ax1.patches:
    bar_top_centre = (p.get_x() + p.get_width() / 2., p.get_height())
    ax1.annotate(format(p.get_height(), '.0f'), bar_top_centre, **bar_label_style)
# dashed lines between the three dataset groups
separators = [0.5, 1.5]
for loc in separators:
    ax1.axvline(loc, **separator_style)
# an empty label list leaves this panel with an empty, frameless legend
ax1.legend('', frameon=False, fancybox=False)
sb.despine(top=True, bottom=False, left=False, right=True);

# --------------- right panel: most frequent trip duration ---------------
sb.set_style('white')
plt.subplot(1, 2, 2)
g = sb.barplot(data = duration_df, x = 'dataset', y = 'duration_mode', hue = 'pass_type', alpha = 0.8)
g.set_title('Most frequent trip durations\n', **panel_title_style)
g.set_ylabel('Duration (minutes)\n', fontsize = 14)
g.set_xlabel('\nTrip durations (minutes)', fontsize = 14)
plt.xticks(fontsize = 12)
# reuse the left panel's y-range so the two panels share one scale
g.set_ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, 10 + int(math.ceil(max(locs))), 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
for p in g.patches:
    bar_top_centre = (p.get_x() + p.get_width() / 2., p.get_height())
    g.annotate(format(p.get_height(), '.0f'), bar_top_centre, **bar_label_style)
separators = [0.5, 1.5]
for loc in separators:
    g.axvline(loc, **separator_style)
# the legend is anchored above and between the two panels
g.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha = 1,
         borderpad=1, borderaxespad=1, loc = 'upper left', labelspacing=0.5, ncol = 2,
         title='Pass type', title_fontsize=12, fontsize=10, facecolor='white',
         markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(-0.5, 1.5))
sb.despine(top=True, bottom=False, left=False, right=True);

# ------------------------- figure-level layout -------------------------
plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.65)
plt.suptitle('Assessment of trip durations based on pass type over datasets\n', fontsize = 16, weight = 'bold');
# bbox_inches='tight' grows the saved area so that all x and y labels are included
plt.savefig('plots/3.2.21.q Assessment of trip durations based on pass type over datasets.png', dpi=300, bbox_inches='tight')
duration_min and fare_type columns:¶
Columns: duration_min, fare_type
Data type: (Numerical, continuous) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
sb.set_style('white')
# Histogram of trip durations (100-minute bins), one facet per fare type,
# with the y axis labelled in millions of rentals.
from matplotlib.ticker import FuncFormatter

flatui = ["gold"]
sb.set_palette(palette = flatui, n_colors = 10, desat = 0.7)
base_color = sb.color_palette()[0]

g = sb.FacetGrid(data = bikeshare, col = 'fare_type', col_wrap = 2, height = 4, aspect = 1)
g.map(plt.hist, "duration_min", color = base_color,
      bins = np.arange(0, bikeshare.duration_min.max()+100, 100), alpha = 0.8)
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Distribution of rental durations based on fare type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')

# Show the y ticks as multiples of a million. The labels are computed from the
# tick values by a formatter instead of parsing the label text: the text can be
# empty or non-integer, and pooling the labels of all facets yields more
# labels than any single axis has ticks.
for ax in g.axes.flat:
    # one formatter per axis, since a formatter is bound to the axis it is set on
    ax.yaxis.set_major_formatter(FuncFormatter(lambda value, pos: '{:0.1f} M'.format(value/1000000)))
    # the x tick text is left as generated; only the label size changes
    ax.tick_params(axis = 'both', labelsize = 12)

g.set_xlabels('\nDuration (minutes)', size = 14)
g.set_ylabels('Rentals (millions)\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3);
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.22.a Facet Grid of rental durations on fare type.png', dpi=300, bbox_inches='tight')
The distribution of duration_min values are vague and not easy to interpret. This can be overcome by limiting the plot to a threshold value.
Let us plot more variety of graphs to observe the distribution of data and hidden insights.
# Three side-by-side views (violin, box, strip) of trip duration per fare type
# over the complete dataset.
plt.figure(figsize = [14, 4])
sb.set_style('darkgrid')
flatui = ["gold"]
sb.set_palette(palette = flatui, n_colors = 10, desat = 0.7)
base_color = sb.color_palette()[0]

# (plot function, plot-specific keyword arguments, panel title, y-axis label);
# only the left-most panel carries a y-axis label
panels = [
    (sb.violinplot, dict(inner = 'quartile', color = base_color),
     'Trip durations - Violin plot\n', 'Duration (minutes)\n'),
    (sb.boxplot, dict(color = base_color),
     'Trip durations - Box plot\n', ''),
    (sb.stripplot, dict(alpha = 0.5, color = base_color),
     'Trip durations - Strip plot\n', ''),
]
for position, (draw, extra_kwargs, panel_title, y_label) in enumerate(panels, start = 1):
    plt.subplot(1, 3, position)
    draw(data = bikeshare, x = 'fare_type', y = 'duration_min', **extra_kwargs)
    plt.title(panel_title, weight = 'bold', fontsize = 16)
    plt.xlabel('\nFare type', fontsize = 14)
    plt.ylabel(y_label, fontsize = 14)
    plt.xticks(fontsize = 12)
    plt.yticks(fontsize = 12)

plt.subplots_adjust(wspace=0.3, hspace=0.3);
# bbox_inches='tight' grows the saved area so that all x and y labels are included
plt.savefig('plots/3.2.22.b Distribution of fare type durations on various plots.png', dpi=300, bbox_inches='tight')
The above plot depicts the presence of a long tail of outliers, which requires closer observation for a better understanding of the data distribution.
Calculate the average trip duration and the most frequent trip duration subjected to each fare type.
# Mean (rounded up to a whole minute) and mode of the trip duration for each
# fare type, printed as two small reports.
base_durations = bikeshare.loc[bikeshare.fare_type == "Base", 'duration_min']
extended_durations = bikeshare.loc[bikeshare.fare_type == "Extended", 'duration_min']

base_mean = math.ceil(base_durations.mean())
base_mode = base_durations.mode()[0]   # first value when several durations tie
extended_mean = math.ceil(extended_durations.mean())
extended_mode = extended_durations.mode()[0]

print('Duration mean'.center(30,'-'))
print('base_mean : ', base_mean, 'minutes')
print('extended_mean : ', extended_mean, 'minutes')
print('\n')
print('Duration mode'.center(30,'-'))
print('base_mode : ', base_mode, 'minutes')
print('extended_mode : ', extended_mode, 'minutes')
The calculations are influenced by the presence of outliers
# For every fare type, plot the mean trip duration (left) and the most
# frequent trip duration (right) over the complete dataset, each point
# annotated with its value in minutes.
plt.figure(figsize = [12, 5])
flatui = ["gold"]
sb.set_palette(palette = flatui, n_colors = 10, desat = 0.7)
base_color = sb.color_palette()[0]

# left plot: point plot - Avg trip duration
# -------------------------------------------------------
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.pointplot(data = bikeshare, x = "fare_type", y = "duration_min", linestyles = "-", color = base_color)
plt.title('Avg. Trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize = 14)
plt.xlabel('\nFare type', fontsize = 14)
plt.xticks(fontsize = 12)
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+5, 25)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);

# add annotations
# -------------------------------------------------------
cat_order = bikeshare.fare_type.sort_values(ascending=True).unique()
# Average only the duration column. Averaging the whole frame also tries to
# average the text/categorical columns, which pandas >= 2.0 rejects.
avg_rental_counts = bikeshare.groupby("fare_type", observed=True)["duration_min"].mean().loc[cat_order]
avg_rental_max = avg_rental_counts.max()
# highlight values within 20% of the largest mean
clrs = ['gold' if (count > ((avg_rental_max*4)/5)) else 'limegreen' for count in avg_rental_counts]
# get the current tick locations and labels
locs, labels = plt.xticks()
for loc, count, clr in zip(locs, avg_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    # place the label slightly to the left of and above the point
    plt.text(loc-0.1, count + int(avg_rental_max/10), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))

# -------------------------------------------------------
# right plot: point plot - most frequent trip duration
sb.set_style('white')
plt.subplot(1, 2, 2)
base_mode = bikeshare.query(' fare_type == "Base" ').duration_min.mode()[0]
extended_mode = bikeshare.query(' fare_type == "Extended" ').duration_min.mode()[0]
mode_by_fare = {'Base': base_mode, 'Extended': extended_mode}
labels = bikeshare.fare_type.sort_values(ascending=True).unique()
# Look each mode up by its label so a point can never be drawn under the
# wrong fare type, whatever order the labels sort into.
heights = [mode_by_fare[label] for label in labels]
sb.pointplot(x = labels, y = heights, color = base_color)
plt.title('Most frequent trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.xlabel('\nFare type', fontsize = 14)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xticks(fontsize = 12)
# share the y-range of the left plot so both panels are directly comparable
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+5, 25)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine();

# add annotations
# -------------------------------------------------------
freq_rental_counts = heights
freq_rental_max = max(freq_rental_counts)
clrs = ['gold' if (count > ((freq_rental_max*4)/5)) else 'limegreen' for count in freq_rental_counts]
# get the current tick locations and labels
locs, labels = plt.xticks()
for loc, count, clr in zip(locs, freq_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    plt.text(loc, count + (freq_rental_max/3), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))

# -------------------------------------------------------
plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of all rental durations based on fare type\n', fontsize = 16, weight = 'bold');
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.22.c Average trip durations based on fare type.png', dpi=300, bbox_inches='tight')
- If the `fare_type` is `Base`, then the bike rentals have an average rental duration of `12 minutes` and a mode of `6 minutes`.
- If the `fare_type` is `Extended`, then the bike rentals have an average rental duration of `119 minutes` and a mode of `31 minutes`.
Trips lasting 1 minute or less are probably bikes that were returned immediately after rental because of a technical or other issue. Hence, exclude the trips with a duration of 1 minute or less and re-evaluate the statistics.
# Drop trips of one minute or less from the complete dataset and plot, per
# fare type, the mean trip duration (left) and the most frequent trip
# duration (right), each point annotated with its value in minutes.
drop_index = bikeshare.query(' duration_min <= 1 ').index
temp_df = bikeshare.drop(drop_index)

plt.figure(figsize = [12, 5])
flatui = ["gold"]
sb.set_palette(palette = flatui, n_colors = 10, desat = 0.7)
base_color = sb.color_palette()[0]

# left plot: point plot - Avg trip duration
# -------------------------------------------------------
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.pointplot(data = temp_df, x = "fare_type", y = "duration_min", linestyles = "-", color = base_color)
plt.title('Avg. Trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize = 14)
plt.xlabel('\nFare type', fontsize = 14)
plt.xticks(fontsize = 12)
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+5, 25)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);

# add annotations
# -------------------------------------------------------
cat_order = temp_df.fare_type.sort_values(ascending=True).unique()
# Average only the duration column. Averaging the whole frame also tries to
# average the text/categorical columns, which pandas >= 2.0 rejects.
avg_rental_counts = temp_df.groupby("fare_type", observed=True)["duration_min"].mean().loc[cat_order]
avg_rental_max = avg_rental_counts.max()
# highlight values within 20% of the largest mean
clrs = ['gold' if (count > ((avg_rental_max*4)/5)) else 'limegreen' for count in avg_rental_counts]
# get the current tick locations and labels
locs, labels = plt.xticks()
for loc, count, clr in zip(locs, avg_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    # place the label slightly to the left of and above the point
    plt.text(loc-0.1, count + int(avg_rental_max/10), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))

# -------------------------------------------------------
# right plot: point plot - most frequent trip duration
sb.set_style('white')
plt.subplot(1, 2, 2)
base_mode = temp_df.query(' fare_type == "Base" ').duration_min.mode()[0]
extended_mode = temp_df.query(' fare_type == "Extended" ').duration_min.mode()[0]
mode_by_fare = {'Base': base_mode, 'Extended': extended_mode}
labels = temp_df.fare_type.sort_values(ascending=True).unique()
# Look each mode up by its label so a point can never be drawn under the
# wrong fare type, whatever order the labels sort into.
heights = [mode_by_fare[label] for label in labels]
sb.pointplot(x = labels, y = heights, color = base_color)
plt.title('Most frequent trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.xlabel('\nFare type', fontsize = 14)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xticks(fontsize = 12)
# share the y-range of the left plot so both panels are directly comparable
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+5, 25)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine();

# add annotations
# -------------------------------------------------------
freq_rental_counts = heights
freq_rental_max = max(freq_rental_counts)
clrs = ['gold' if (count > ((freq_rental_max*4)/5)) else 'limegreen' for count in freq_rental_counts]
# get the current tick locations and labels
locs, labels = plt.xticks()
for loc, count, clr in zip(locs, freq_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    plt.text(loc-0.1, count + (freq_rental_max/2), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))

# -------------------------------------------------------
plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of all rental durations based on fare type\n', fontsize = 16, weight = 'bold');
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.22.d Average trip durations based on fare type.png', dpi=300, bbox_inches='tight')
When the trips with durations <= 1 minute are removed from the assessment, the result is as follows:
- If the `fare_type` is `Base`, then the bike rentals have an average rental duration of `12 minutes` and a mode of `6 minutes`.
- If the `fare_type` is `Extended`, then the bike rentals have an average rental duration of `119 minutes` and a mode of `31 minutes`.
Dataset limited under 120 min:
# Summary statistics (count, mean, std, min, quartiles, max) of the trip durations.
bikeshare['duration_min'].describe()
The distribution of `duration_min` values is vague and not easy to interpret. This can be overcome by limiting the dataset to a threshold value, say `120 minutes` or `2 hours`, as most of the `duration_min` values fall under this boundary.
# Percentage (2 decimals) of all trips that last at most 2 hours.
trips_within_2h = bikeshare.query(' duration_min <= 120 ')
np.round(len(trips_within_2h) / len(bikeshare) * 100, 2)
# Trips longer than 2 hours, i.e. the rows a 120-minute cut-off would drop.
duration_120 = bikeshare.query(' duration_min > 120 ')

# For each fare type: percentage (2 decimals) of that fare type's trips that
# are longer than 2 hours and would therefore be dropped.
dropped_pct = {}
for fare in ("Base", "Extended"):
    n_dropped = len(duration_120[duration_120.fare_type == fare])
    n_total = len(bikeshare[bikeshare.fare_type == fare])
    dropped_pct[fare] = np.round((n_dropped / n_total)*100, 2)
base_type_drops = dropped_pct["Base"]
extended_type_drops = dropped_pct["Extended"]

print("Proportion of base type entries that will be dropped".ljust(60, ' '), ':', base_type_drops)
print("Proportion of extended type entries that will be dropped".ljust(60, ' '), ':', extended_type_drops)
# Keep only the trips that last at most 2 hours (120 minutes).
duration_lim_120 = bikeshare[bikeshare.duration_min <= 120]
# Histogram (5-minute bins) of the trip durations of at most 2 hours, with the
# y axis labelled in thousands of rentals.
sb.set_style('white')
flatui = ["gold"]
sb.set_palette(palette = flatui, n_colors = 10, desat = 0.7)
base_color = sb.color_palette()[0]

longest_trip = duration_lim_120.duration_min.max()
bin_edges = np.arange(0, longest_trip+10, 5)
plt.hist(duration_lim_120['duration_min'], color = base_color, bins = bin_edges)

hist_ax = plt.gca()
hist_ax.set_title('Distribution of trip durations under 2 hours\n', weight = 'bold', fontsize = 16)
hist_ax.set_xlabel('\nDuration (minutes)', fontsize = 14)
hist_ax.set_ylabel('Rentals (thousands)\n', fontsize = 14)
plt.xticks(fontsize = 12)

# re-tick the y axis every 50,000 rentals and print the ticks as thousands
locs, labels = plt.yticks()
y_tick_locs = np.arange(0, int(math.ceil(max(locs)))+1000, 50000)
y_tick_names = ['{:0.0f} K'.format(thousands) for thousands in y_tick_locs/1000]
plt.yticks(y_tick_locs, y_tick_names, fontsize=12)
sb.despine();

# bbox_inches='tight' grows the saved area so that all x and y labels are included
plt.savefig('plots/3.2.22.e Distribution of trip durations under 2 hours.png', dpi=300, bbox_inches='tight')
# Faceted histograms of the trip durations (limited to 120 minutes), one panel per fare type.
sb.set_style('white')
flatui = ["gold"]
sb.set_palette(palette = flatui, n_colors = 10, desat = 0.7)
base_color = sb.color_palette()[0]

g = sb.FacetGrid(data = duration_lim_120, col = 'fare_type', col_wrap = 2, height = 4, aspect = 1, xlim=(0,120))
g.set(xmargin=0.5, ymargin=0.5)
# 5-minute wide bins shared by every facet
g.map(plt.hist, "duration_min", color = base_color, bins = np.arange(0, duration_lim_120.duration_min.max()+10, 5))
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Distribution of rental durations under 120 min based on fare type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')

# Show the y ticks as a multiple of thousands ('K') and use a font size of 12 on both axes.
# A formatter is attached to each facet instead of reading the rendered tick-label text
# back and re-setting it: that text is empty until the figure has been drawn, and pooling
# the labels of all facets into one list gives more labels than a single axis has ticks.
from matplotlib.ticker import FuncFormatter
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(FuncFormatter(lambda value, pos: '{:0.0f} K'.format(value/1000)))
    ax.tick_params(axis = 'both', labelsize = 12)

g.set_xlabels('\nDuration (minutes)', size = 14)
g.set_ylabels('Rentals (thousands)\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3);

# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.22.f Facet Grid of rental durations under 2 hours on fare type.png', dpi=300, bbox_inches='tight')
Let us plot a wider variety of graphs to observe the distribution of the data and uncover hidden insights.
# Violin, box and strip plots of the trip durations (limited to 120 minutes) per fare type,
# drawn side by side in a single row.
plt.figure(figsize = [14, 5])
sb.set_style('darkgrid')
flatui = ["gold"]
sb.set_palette(palette = flatui, n_colors = 10, desat = 0.7)
base_color = sb.color_palette()[0]

# one entry per panel, left to right:
# (drawing function, extra keyword arguments, panel title, x label, y label)
panels = [
    (sb.violinplot, {'inner': 'quartile'}, 'Trip durations - Violin plot\n', '\nFare type', 'Duration (minutes)\n'),
    (sb.boxplot, {}, 'Trip durations - Box plot\n', '\nFare Type', ''),
    (sb.stripplot, {'alpha': 0.002}, 'Trip durations - Strip plot\n', '\nFare Type', ''),
]
for position, (draw, extra, title, x_label, y_label) in enumerate(panels, start = 1):
    plt.subplot(1, 3, position)
    draw(data = duration_lim_120, x = 'fare_type', y = 'duration_min', color = base_color, **extra)
    plt.title(title, weight = 'bold', fontsize = 16, color = 'dimgrey')
    plt.xlabel(x_label, fontsize = 14)
    plt.ylabel(y_label, fontsize = 14)
    plt.xticks(fontsize = 12)
    plt.yticks(fontsize = 12)

# spacing between the panels and head room for the figure title
plt.subplots_adjust(wspace=0.2, hspace=0.3, top=0.75)
plt.suptitle('Distribution of rental durations under 120 min based on fare type\n', fontsize = 16, weight = 'bold');

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.22.g Distribution of fare type durations under 2 hours on various plots.png', dpi=300, bbox_inches='tight')
Calculate the average trip duration and the most frequent trip duration for each fare type.
# Mean (rounded up to a whole minute) and mode of the trip duration per fare type,
# computed on the dataset limited to 120 minutes.
base_durations = duration_lim_120.query(' fare_type == "Base" ').duration_min
extended_durations = duration_lim_120.query(' fare_type == "Extended" ').duration_min

base_mean = math.ceil(base_durations.mean())
# mode() returns every value tied for most frequent; the first one is used
base_mode = base_durations.mode()[0]
extended_mean = math.ceil(extended_durations.mean())
extended_mode = extended_durations.mode()[0]

print('Duration mean'.center(30,'-'))
print('base_mean : ', base_mean, 'minutes')
print('extended_mean : ', extended_mean, 'minutes')
print('\n')
print('Duration mode'.center(30,'-'))
print('base_mode : ', base_mode, 'minutes')
print('extended_mode : ', extended_mode, 'minutes')
# Two-panel figure for the trips limited to 120 minutes:
#   left  - average trip duration per fare type (point plot)
#   right - most frequent trip duration per fare type (point plot)
# Every plotted value is annotated with its duration in minutes.
plt.figure(figsize = [12, 5])
flatui = ["gold"]
sb.set_palette(palette = flatui, n_colors = 10, desat = 0.7)
base_color = sb.color_palette()[0]

# left plot: point plot - Avg trip duration
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.pointplot(data = duration_lim_120, x = "fare_type", y = "duration_min", linestyles = "-", color = base_color)
plt.title('Avg. Trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize = 14)
plt.xlabel('\nFare type', fontsize = 14)
plt.xticks(fontsize = 12)
# y ticks every 10 minutes, starting from 0
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);

# add annotations
# -------------------------------------------------------
cat_order = duration_lim_120.fare_type.sort_values(ascending=True).unique()
# Average only the duration column: averaging the whole frame raises on its
# non-numeric columns (e.g. `trip_type`, `start_time`) in recent pandas versions.
avg_rental_counts = duration_lim_120.groupby('fare_type')['duration_min'].mean()[cat_order]
avg_rental_max = avg_rental_counts.max()
# values above 4/5 of the largest average get a gold label box, the others a green one
clrs = ['gold' if (count > ((avg_rental_max*4)/5)) else 'limegreen' for count in avg_rental_counts ]

# get the current tick locations and labels
locs, labels = plt.xticks()

# loop through each tick location together with its average and label colour
for loc, count, clr in zip(locs, avg_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    # place the label slightly to the left of and above the plotted point
    plt.text(loc-0.1, count + int(avg_rental_max/10), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

# right plot: point plot - most frequent trip duration
sb.set_style('white')
plt.subplot(1, 2, 2)
base_mode = duration_lim_120.query(' fare_type == "Base" ').duration_min.mode()[0]
extended_mode = duration_lim_120.query(' fare_type == "Extended" ').duration_min.mode()[0]
# NOTE(review): heights are listed as Base, Extended; this assumes the sorted unique
# fare types below come out in that same order - confirm if more fare types are added
heights = [base_mode, extended_mode]
labels = duration_lim_120.fare_type.sort_values(ascending=True).unique()
sb.pointplot(x = labels, y = heights, color = base_color)
plt.title('Most frequent trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.xlabel('\nFare type', fontsize = 14)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xticks(fontsize = 12)
# reuse the y range of the left plot so that both panels are directly comparable
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine();

# add annotations
# -------------------------------------------------------
freq_rental_counts = heights
freq_rental_max = max(freq_rental_counts)
clrs = ['gold' if (count > ((freq_rental_max*4)/5)) else 'limegreen' for count in freq_rental_counts ]

# get the current tick locations and labels
locs, labels = plt.xticks()

# loop through each tick location together with its mode and label colour
for loc, count, clr in zip(locs, freq_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    plt.text(loc-0.1, count + (freq_rental_max/6), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of rental durations under 120 min based on fare type\n', fontsize = 16, weight = 'bold');

# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.22.h Average trip durations under 2 hours based on fare type.png', dpi=300, bbox_inches='tight')
When the dataset is limited to trips under 120 minutes:
- If the `fare_type` is `Base`, then the bike rentals have an average rental duration of `12 minutes` and a mode of `6 minutes`.
- If the `fare_type` is `Extended`, then the bike rentals have an average rental duration of `57 minutes` and a mode of `31 minutes`.
Trips lasting 1 minute or less are probably bikes that were returned immediately after rental because of a technical or other issue. Hence, exclude the trips that last 1 minute or less and re-evaluate the statistics.
# Remove the trips that lasted 1 minute or less from the 120-minute dataset
short_trips = duration_lim_120.loc[duration_lim_120['duration_min'] <= 1]
drop_index = short_trips.index
temp_df = duration_lim_120.drop(index = drop_index)
# Two-panel figure for the trips limited to 120 minutes, with the trips of 1 minute
# or less removed (`temp_df`):
#   left  - average trip duration per fare type (point plot)
#   right - most frequent trip duration per fare type (point plot)
# Every plotted value is annotated with its duration in minutes.
plt.figure(figsize = [12, 5])
flatui = ["gold"]
sb.set_palette(palette = flatui, n_colors = 10, desat = 0.7)
base_color = sb.color_palette()[0]

# left plot: point plot - Avg trip duration
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.pointplot(data = temp_df, x = "fare_type", y = "duration_min", linestyles = "-", color = base_color)
plt.title('Avg. Trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize = 14)
plt.xlabel('\nFare type', fontsize = 14)
plt.xticks(fontsize = 12)
# y ticks every 10 minutes, starting from 0
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);

# add annotations
# -------------------------------------------------------
cat_order = temp_df.fare_type.sort_values(ascending=True).unique()
# Average only the duration column: averaging the whole frame raises on its
# non-numeric columns (e.g. `trip_type`, `start_time`) in recent pandas versions.
avg_rental_counts = temp_df.groupby('fare_type')['duration_min'].mean()[cat_order]
avg_rental_max = avg_rental_counts.max()
# values above 4/5 of the largest average get a gold label box, the others a green one
clrs = ['gold' if (count > ((avg_rental_max*4)/5)) else 'limegreen' for count in avg_rental_counts ]

# get the current tick locations and labels
locs, labels = plt.xticks()

# loop through each tick location together with its average and label colour
for loc, count, clr in zip(locs, avg_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    # place the label slightly to the left of and above the plotted point
    plt.text(loc-0.1, count + int(avg_rental_max/10), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

# right plot: point plot - most frequent trip duration
sb.set_style('white')
plt.subplot(1, 2, 2)
base_mode = temp_df.query(' fare_type == "Base" ').duration_min.mode()[0]
extended_mode = temp_df.query(' fare_type == "Extended" ').duration_min.mode()[0]
# NOTE(review): heights are listed as Base, Extended; this assumes the sorted unique
# fare types below come out in that same order - confirm if more fare types are added
heights = [base_mode, extended_mode]
labels = temp_df.fare_type.sort_values(ascending=True).unique()
sb.pointplot(x = labels, y = heights, color = base_color)
plt.title('Most frequent trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.xlabel('\nFare type', fontsize = 14)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xticks(fontsize = 12)
# reuse the y range of the left plot so that both panels are directly comparable
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 10)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine();

# add annotations
# -------------------------------------------------------
freq_rental_counts = heights
freq_rental_max = max(freq_rental_counts)
clrs = ['gold' if (count > ((freq_rental_max*4)/5)) else 'limegreen' for count in freq_rental_counts ]

# get the current tick locations and labels
locs, labels = plt.xticks()

# loop through each tick location together with its mode and label colour
for loc, count, clr in zip(locs, freq_rental_counts, clrs):
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    plt.text(loc-0.1, count + (freq_rental_max/6), pct_string, ha = 'center', color = 'black',
             fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------

plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
# the figure is split by fare type (as its file name says), so the title says so too
plt.suptitle('Assessment of rental durations under 120 min based on fare type\n', fontsize = 16, weight = 'bold');

# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.22.i Average trip durations under 2 hours based on fare type.png', dpi=300, bbox_inches='tight')
When the dataset is limited to trips under 120 minutes (excluding trips of 1 minute or less):
- If the `fare_type` is `Base`, then the bike rentals have an average rental duration of `12 minutes` and a mode of `6 minutes`.
- If the `fare_type` is `Extended`, then the bike rentals have an average rental duration of `57 minutes` and a mode of `31 minutes`.
Dataset limited under 30 min:
# Summary statistics of the trip durations over the full dataset
bikeshare['duration_min'].describe()
The calculations are influenced by the presence of outliers. This can be overcome by limiting the dataset to a threshold value, say `30 minutes`, as more than `75%` of the `duration_min` values fall under this boundary.
# Keep only the trips that last 30 minutes or less
duration_lim_30 = bikeshare.loc[bikeshare['duration_min'] <= 30]
# Histogram of the trip durations for the dataset limited to 30 minutes
sb.set_style('white')
flatui = ["gold"]
sb.set_palette(palette=flatui, n_colors=10, desat=0.7)
base_color = sb.color_palette()[0]

# 1-minute wide bins; the upper bound is padded so the longest trip falls inside a bin
longest_trip = duration_lim_30.duration_min.max()
bin_edges = np.arange(0, longest_trip+2, 1)
plt.hist(duration_lim_30['duration_min'], bins=bin_edges, color=base_color)

plt.title('Distribution of trip durations under 30 minutes\n', fontsize=16, weight='bold')
plt.xlabel('\nDuration (minutes)', fontsize=14)
plt.ylabel('Rentals (thousands)\n', fontsize=14)
plt.xticks(fontsize=12)

# place a y tick every 10,000 rentals and label it in thousands ('K')
locs, labels = plt.yticks()
highest_tick = int(math.ceil(max(locs)))
y_tick_locs = np.arange(0, highest_tick+1000, 10000)
y_tick_names = [format(loc/1000, '0.0f') + ' K' for loc in y_tick_locs]
plt.yticks(y_tick_locs, y_tick_names, fontsize=12)
sb.despine();

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.22.j Distribution of trip durations under 30 minutes.png', dpi=300, bbox_inches='tight')
# Faceted histograms of the trip durations (limited to 30 minutes), one panel per fare type.
sb.set_style('white')
flatui = ["gold"]
sb.set_palette(palette = flatui, n_colors = 10, desat = 0.7)
base_color = sb.color_palette()[0]

g = sb.FacetGrid(data = duration_lim_30, col = 'fare_type', col_wrap = 2, height = 4, aspect = 1, xlim=(0,30))
g.set(xmargin=0.5, ymargin=0.5)
# 1-minute wide bins shared by every facet
g.map(plt.hist, "duration_min", color = base_color, bins = np.arange(0, duration_lim_30.duration_min.max()+2, 1))
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Distribution of rental durations under 30 min based on fare type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')

# Show the y ticks as a multiple of thousands ('K') and use a font size of 12 on both axes.
# A formatter is attached to each facet instead of reading the rendered tick-label text
# back and re-setting it: that text is empty until the figure has been drawn, and pooling
# the labels of all facets into one list gives more labels than a single axis has ticks.
from matplotlib.ticker import FuncFormatter
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(FuncFormatter(lambda value, pos: '{:0.0f} K'.format(value/1000)))
    ax.tick_params(axis = 'both', labelsize = 12)

g.set_xlabels('\nDuration (minutes)', size = 14)
g.set_ylabels('Rentals (thousands)\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3);

# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.22.k Facet Grid of rental durations under 30 minutes on fare type.png', dpi=300, bbox_inches='tight')
Let us plot a wider variety of graphs to observe the distribution of the data and uncover hidden insights.
# Violin, box and strip plots of the trip durations (limited to 30 minutes) per fare type,
# drawn side by side in a single row.
plt.figure(figsize = [14, 5])
sb.set_style('darkgrid')
flatui = ["gold"]
sb.set_palette(palette = flatui, n_colors = 10, desat = 0.7)
base_color = sb.color_palette()[0]

# one entry per panel, left to right:
# (drawing function, extra keyword arguments, panel title, x label, y label)
panels = [
    (sb.violinplot, {'inner': 'quartile'}, 'Trip durations - Violin plot\n', '\nFare type', 'Duration (minutes)\n'),
    (sb.boxplot, {}, 'Trip durations - Box plot\n', '\nFare Type', ''),
    (sb.stripplot, {'alpha': 0.002}, 'Trip durations - Strip plot\n', '\nFare Type', ''),
]
for position, (draw, extra, title, x_label, y_label) in enumerate(panels, start = 1):
    plt.subplot(1, 3, position)
    draw(data = duration_lim_30, x = 'fare_type', y = 'duration_min', color = base_color, **extra)
    plt.title(title, weight = 'bold', fontsize = 16, color = 'dimgrey')
    plt.xlabel(x_label, fontsize = 14)
    plt.ylabel(y_label, fontsize = 14)
    plt.xticks(fontsize = 12)
    plt.yticks(fontsize = 12)

# spacing between the panels and head room for the figure title
plt.subplots_adjust(wspace=0.2, hspace=0.3, top=0.75)
plt.suptitle('Distribution of rental durations under 30 min based on fare type\n', fontsize = 16, weight = 'bold');

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.22.l Distribution of fare type durations under 30 minutes on various plots.png', dpi=300, bbox_inches='tight')
Calculate the average trip duration and the most frequent trip duration for each fare type.
# Mean (rounded up to a whole minute) and mode of the 'Base' trip durations,
# computed on the dataset limited to 30 minutes.
base_durations = duration_lim_30.query(' fare_type == "Base" ').duration_min
base_mean = math.ceil(base_durations.mean())
# mode() returns every value tied for most frequent; the first one is used
base_mode = base_durations.mode()[0]
# extended statistics are not calculated as they do not exist under 30 minutes

print('Duration mean'.center(30,'-'))
print('base_mean : ', base_mean, 'minutes')
print('\n')
print('Duration mode'.center(30,'-'))
print('base_mode : ', base_mode, 'minutes')
# Two-panel bar chart for the trips limited to 30 minutes:
#   left  - average trip duration per fare type
#   right - most frequent trip duration per fare type
def _annotate_bar_heights(axes):
    """Write the height of every bar of `axes`, without decimals, just above the bar."""
    for p in axes.patches:
        axes.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()),
                      ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points', fontsize = 12)


plt.figure(figsize = [12, 5])
flatui = ["gold"]
sb.set_palette(palette = flatui, n_colors = 10, desat = 0.7)
base_color = sb.color_palette()[0]

# left plot: bar plot - Avg trip duration
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.barplot(data = duration_lim_30, x = "fare_type", y = "duration_min", color = base_color)
plt.title('Avg. Trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize = 14)
plt.xlabel('\nFare type', fontsize = 14)
plt.xticks(fontsize = 12)
# y ticks every 5 minutes, starting from 0
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 5)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);
_annotate_bar_heights(ax1)

# right plot: bar plot - most frequent trip duration
sb.set_style('white')
plt.subplot(1, 2, 2)
base_mode = duration_lim_30.query(' fare_type == "Base" ').duration_min.mode()[0]
# 'Extended' is drawn with a height of 0
heights = [base_mode, 0]
labels = ['Base', 'Extended']
g = sb.barplot(x = labels, y = heights, color = base_color)
plt.title('Most frequent trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.xlabel('\nFare type', fontsize = 14)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xticks(fontsize = 12)
# reuse the y range of the left plot so that both panels are directly comparable
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 5)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine();
_annotate_bar_heights(g)

plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of rental durations under 30 min based on fare type\n', fontsize = 16, weight = 'bold');

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.22.m Average trip durations under 30 minutes based on fare type.png', dpi=300, bbox_inches='tight')
When the dataset is limited to trips under 30 minutes:
- If the `fare_type` is `Base`, then the bike rentals have an average rental duration of `11 minutes` and a mode of `6 minutes`.
- The `Extended` `fare_type` does not exist under 30 minutes, as extended fares start only after a 30-minute duration.
Trips lasting 1 minute or less are probably bikes that were returned immediately after rental because of a technical or other issue. Hence, exclude the trips that last 1 minute or less and re-evaluate the statistics.
# Remove the trips that lasted 1 minute or less from the 30-minute dataset
short_trips = duration_lim_30.loc[duration_lim_30['duration_min'] <= 1]
drop_index = short_trips.index
temp_df = duration_lim_30.drop(index = drop_index)
# Two-panel bar chart for the trips limited to 30 minutes, with the trips of 1 minute
# or less removed (`temp_df`):
#   left  - average trip duration per fare type
#   right - most frequent trip duration per fare type
def _annotate_bar_heights(axes):
    """Write the height of every bar of `axes`, without decimals, just above the bar."""
    for p in axes.patches:
        axes.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()),
                      ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points', fontsize = 12)


plt.figure(figsize = [12, 5])
flatui = ["gold"]
sb.set_palette(palette = flatui, n_colors = 10, desat = 0.7)
base_color = sb.color_palette()[0]

# left plot: bar plot - Avg trip duration
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.barplot(data = temp_df, x = "fare_type", y = "duration_min", color = base_color)
plt.title('Avg. Trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize = 14)
plt.xlabel('\nFare type', fontsize = 14)
plt.xticks(fontsize = 12)
# y ticks every 5 minutes, starting from 0
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 5)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);
_annotate_bar_heights(ax1)

# right plot: bar plot - most frequent trip duration
sb.set_style('white')
plt.subplot(1, 2, 2)
base_mode = temp_df.query(' fare_type == "Base" ').duration_min.mode()[0]
# 'Extended' is drawn with a height of 0
heights = [base_mode, 0]
labels = ['Base', 'Extended']
g = sb.barplot(x = labels, y = heights, color = base_color)
plt.title('Most frequent trip duration\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.xlabel('\nFare type', fontsize = 14)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xticks(fontsize = 12)
# reuse the y range of the left plot so that both panels are directly comparable
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+10, 5)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
sb.despine();
_annotate_bar_heights(g)

plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of rental durations under 30 min based on fare type\n', fontsize = 16, weight = 'bold');

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.22.n Average trip durations under 30 minutes based on fare type.png', dpi=300, bbox_inches='tight')
When the dataset is limited to trips under 30 minutes (excluding trips of 1 minute or less):
- If the `fare_type` is `Base`, then the average rental duration of the bike rentals increases to `12 minutes`, and the mode is `6 minutes`.
- The `Extended` `fare_type` does not exist under 30 minutes, as extended fares start only after a 30-minute duration.
Tabular data of the average trip durations based on the dataset limitation of duration entries:
| Dataset used to measure | Avg trip duration (min) - Base | Avg trip duration (min) - Extended |
|---|---|---|
| trips under 30 minutes | 12 | 0 |
| trips under 120 minutes | 12 | 57 |
| overall trips | 12 | 119 |
Tabular data of the most frequent trip durations based on the dataset limitation of duration entries:
| Dataset used to measure | Most freq trip duration (min) - Base | Most freq trip duration (min) - Extended |
|---|---|---|
| trips under 30 minutes | 6 | 0 |
| trips under 120 minutes | 6 | 31 |
| overall trips | 6 | 31 |
# Summary table of the average and most frequent trip duration (in minutes) per dataset
# limit and fare type; NaN marks the 'Extended' fare type under 30 minutes, which has no
# value.
duration_df = pd.DataFrame({
    'dataset': ['< 30', '< 30', '< 120', '< 120', 'overall', 'overall'],
    'fare_type': ['Base', 'Extended', 'Base', 'Extended', 'Base', 'Extended'],
    'duration_avg': [12, np.nan, 12, 57, 12, 119],
    'duration_mode': [6, np.nan, 6, 31, 6, 31],
})
duration_df
# Grouped bar charts of `duration_df`: fare type on the x axis, one bar per dataset limit.
#   left  - average trip duration
#   right - most frequent trip duration
def _annotate_bar_heights(axes):
    """Write the height of every bar of `axes`, without decimals, just above the bar.

    Bars whose height is NaN (no value for that dataset / fare type combination) are
    skipped, so that no annotation is placed at a non-finite position.
    """
    for p in axes.patches:
        height = p.get_height()
        if math.isnan(height):
            continue
        axes.annotate(format(height, '.0f'), (p.get_x() + p.get_width() / 2., height),
                      ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points', fontsize = 12)


plt.figure(figsize = [12, 5])
sb.set_palette(palette = "GnBu", n_colors = 3, desat = None)
base_color = sb.color_palette()[2]

# left plot: bar plot - Avg trip duration
# -------------------------------------------------------
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.barplot(data = duration_df, x = 'fare_type', y = 'duration_avg', hue = 'dataset')
plt.title('Avg. Trip durations\n', weight = 'bold', fontsize = 16, color ='dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize = 14)
plt.xlabel('\nFare type', fontsize = 14)
plt.xticks(fontsize = 12)
# y ticks every 25 minutes, starting from 0
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+25, 25)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
# empty, frameless legend: the legend is shown on the right plot only
plt.legend('', frameon=False, fancybox=False)

# add annotations
_annotate_bar_heights(ax1)

# dashed vertical line between the two fare type groups
separators = [0.5]
for loc in separators:
    plt.axvline(loc, ls='--', color='grey', linewidth=1, alpha=0.4);
sb.despine(top=True, bottom=False, left=False, right=True);
# -------------------------------------------------------

# right plot: bar plot - most frequent trip duration
# -------------------------------------------------------
sb.set_style('white')
plt.subplot(1, 2, 2)
g = sb.barplot(data = duration_df, x = 'fare_type', y = 'duration_mode', hue = 'dataset')
plt.title('Most frequent durations\n', weight = 'bold', fontsize = 16, color ='dimgrey')
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xlabel('\nFare type', fontsize = 14)
plt.xticks(fontsize = 12)
# reuse the y range of the left plot so that both panels are directly comparable
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+25, 25)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha = 1,
           borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5, ncol = 1,
           title='Trip durations', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.2, 1))

# add annotations
_annotate_bar_heights(g)

# dashed vertical line between the two fare type groups
separators = [0.5]
for loc in separators:
    plt.axvline(loc, ls='--', color='grey', linewidth=1, alpha=0.4);
sb.despine(top=True, bottom=False, left=False, right=True);
# -------------------------------------------------------

plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.75)
plt.suptitle('Assessment of trip durations based on dataset over fare type\n', fontsize = 16, weight = 'bold');

# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.22.o Assessment of trip durations based on dataset over fare type.png', dpi=300, bbox_inches='tight')
# Point plots of `duration_df`: dataset limit on the x axis, one line per fare type.
#   left  - average trip duration
#   right - most frequent trip duration
# Every plotted value is annotated with its duration in minutes.
plt.figure(figsize = [12, 5])
flatui = ["#e278fa", "#787efa"]
sb.set_palette(flatui, n_colors=2, desat=0.8)

# left plot: point plot - Avg trip duration
# -------------------------------------------------------
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.pointplot(data = duration_df, x = 'dataset', y = 'duration_avg', hue = 'fare_type', alpha = 0.8)
plt.title('Avg. Trip durations\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize = 14)
plt.xlabel('\nTrip durations (minutes)', fontsize = 14)
plt.xticks(fontsize = 12)
# y ticks every 25 minutes, starting from 0
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+25, 25)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
# empty, frameless legend: the legend is shown on the right plot only
plt.legend('', frameon=False, fancybox=False)
sb.despine(top=True, bottom=False, left=False, right=True);

# add annotations
# -------------------------------------------------------
# x position of every `duration_df` row: two rows (Base, Extended) per dataset limit
locs = [0, 0, 1, 1, 2, 2]
avg_rental_counts = duration_df["duration_avg"]
avg_rental_types = duration_df["fare_type"]
avg_rental_max = avg_rental_counts.max()
clrs = ['mediumpurple' if (trip == "Extended") else 'violet' for trip in avg_rental_types ]

# loop through each x position together with its average and label colour
for loc, count, clr in zip(locs, avg_rental_counts, clrs):
    # rows without a value (NaN) are not annotated
    if math.isnan(count):
        continue
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    # place the label slightly to the left of and above the plotted point
    plt.text(loc-0.2, count + int(avg_rental_max/20), pct_string, ha = 'center', color = 'black', fontsize = 12,
             bbox={'pad':1.9,'alpha':0.2,'color':'none','fc':clr})
# -------------------------------------------------------

# right plot: point plot - most frequent trip duration
# -------------------------------------------------------
sb.set_style('white')
plt.subplot(1, 2, 2)
g = sb.pointplot(data = duration_df, x = 'dataset', y = 'duration_mode', hue = 'fare_type', alpha = 0.8)
plt.title('Most frequent durations\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xlabel('\nTrip durations (minutes)', fontsize = 14)
plt.xticks(fontsize = 12)
# reuse the y range of the left plot so that both panels are directly comparable
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+25, 25)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
# the hue of this plot is `fare_type`, so the legend is titled accordingly
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha = 1,
           borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5, ncol = 1,
           title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5)
sb.despine(top=True, bottom=False, left=False, right=True);

# add annotations
# -------------------------------------------------------
# x position of every `duration_df` row: two rows (Base, Extended) per dataset limit
locs = [0, 0, 1, 1, 2, 2]
freq_rental_counts = duration_df["duration_mode"]
freq_rental_types = duration_df["fare_type"]
freq_rental_max = freq_rental_counts.max()
clrs = ['mediumpurple' if (trip == "Extended") else 'violet' for trip in freq_rental_types ]

# loop through each x position together with its mode and label colour
for loc, count, clr in zip(locs, freq_rental_counts, clrs):
    # rows without a value (NaN) are not annotated
    if math.isnan(count):
        continue
    pct_string = '{:0.0f} min'.format(math.ceil(count))
    plt.text(loc, count + int(freq_rental_max/5), pct_string, ha = 'center', color = 'black', fontsize = 12,
             bbox={'pad':1.9,'alpha':0.2,'color':'none','fc':clr})
# -------------------------------------------------------

plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.75)
plt.suptitle('Assessment of trip durations based on fare type over datasets\n', fontsize = 16, weight = 'bold');

# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.22.p Assessment of trip durations based on fare type over datasets.png', dpi=300, bbox_inches='tight')
# Figure 3.2.22.q - average (left) and most frequent (right) trip duration
# per dataset, drawn as grouped bars with one bar per fare type.
plt.figure(figsize = [12, 5])
flatui = ["#e278fa", "#787efa"]
sb.set_palette(flatui, n_colors=2, desat=0.8)
# left plot: bar chart - Avg trip duration
# -------------------------------------------------------
# -------------------------------------------------------
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.barplot(data = duration_df, x = 'dataset', y = 'duration_avg', hue = 'fare_type', alpha = 0.8)
plt.title('Avg. Trip durations\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Avg. duration (minutes)\n', fontsize = 14)
# the x axis carries the `dataset` column (durations are on the y axis)
plt.xlabel('\nDataset', fontsize = 14)
plt.xticks(fontsize = 12)
# replace the automatic y ticks with ticks every 25 minutes over the same range
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+25, 25)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
# swap the automatic legend for an empty, frameless one; the right plot carries the legend
plt.legend('', frameon=False, fancybox=False)
sb.despine(top=True, bottom=False, left=False, right=True);
# add annotations: print the bar value just above every bar
# -------------------------------------------------------
for p in ax1.patches:
    height = p.get_height()
    # a missing dataset/fare-type combination gives a NaN bar: nothing to label
    if not np.isfinite(height):
        continue
    ax1.annotate(format(height, '.0f'), (p.get_x() + p.get_width() / 2., height),
                 ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points', fontsize = 12)
# dashed vertical guides between neighbouring dataset groups
separators = [0.5, 1.5]
for loc in separators:
    plt.axvline(loc, ls='--', color='grey', linewidth=1, alpha=0.4)
# -------------------------------------------------------
# -------------------------------------------------------
# right plot: bar chart - most frequent trip duration
# -------------------------------------------------------
# -------------------------------------------------------
sb.set_style('white')
plt.subplot(1, 2, 2)
g = sb.barplot(data = duration_df, x = 'dataset', y = 'duration_mode', hue = 'fare_type', alpha = 0.8)
plt.title('Most frequent durations\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xlabel('\nDataset', fontsize = 14)
plt.xticks(fontsize = 12)
# reuse the left plot's y range so both panels are directly comparable
plt.ylim(ax1.get_ylim())
locs, labels = plt.yticks()
y_ticks_new = np.arange(0, int(math.ceil(max(locs)))+25, 25)
plt.yticks(y_ticks_new, y_ticks_new, fontsize=12)
# the hue variable is `fare_type`, so the legend is titled accordingly
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha = 1,
           borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5, ncol = 1,
           title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.1, 1))
sb.despine(top=True, bottom=False, left=False, right=True);
# add annotations: print the bar value just above every bar
# -------------------------------------------------------
for p in g.patches:
    height = p.get_height()
    # a missing dataset/fare-type combination gives a NaN bar: nothing to label
    if not np.isfinite(height):
        continue
    g.annotate(format(height, '.0f'), (p.get_x() + p.get_width() / 2., height),
               ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points', fontsize = 12)
separators = [0.5, 1.5]
for loc in separators:
    plt.axvline(loc, ls='--', color='grey', linewidth=1, alpha=0.4)
# -------------------------------------------------------
# -------------------------------------------------------
plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.75)
plt.suptitle('Assessment of trip durations based on fare type over datasets\n', fontsize = 16, weight = 'bold');
# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.22.q Assessment of trip durations based on fare type over datasets.png', dpi=300, bbox_inches='tight')
distance_miles and trip_type columns:¶
Columns: distance_miles, trip_type
Data type: (Numerical, continuous) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Bar chart
# Observe the distribution of distances through descriptive statistics
# summary statistics of the trip distance column (shown as the cell output)
bikeshare['distance_miles'].describe()
Use the above descriptive statistics to set the bin size for the upcoming plots.
# Figure 3.2.23.a - histogram of trip distances (1-mile bins), one facet per trip type.
from matplotlib.ticker import FuncFormatter

sb.set_style('white')
sb.set_palette(palette = "GnBu_d", n_colors = 5, desat = None)
base_color = sb.color_palette()[0]
g = sb.FacetGrid(data = bikeshare, col = 'trip_type', col_wrap = 2, height = 4, aspect = 1)
g.map(plt.hist, "distance_miles", color = base_color, bins = np.arange(0, bikeshare.distance_miles.max()+1, 1))
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Distribution of trip distances based on trip type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
# Show the y ticks (rental counts) as multiples of a million.
# A formatter works on the numeric tick value, so it does not depend on the
# tick label text having been rendered already, nor on how matplotlib chose to
# write it (plain, scientific or offset notation), and it is applied per axis
# instead of concatenating the labels of all facets into one list.
millions_formatter = FuncFormatter(lambda count, _pos: '{:0.1f} M'.format(count/1000000))
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(millions_formatter)
    ax.tick_params(axis = 'both', labelsize = 12)
# the histogram variable is the trip distance, so the x axis is in miles
g.set_xlabels('\nDistance (miles)', size = 14)
g.set_ylabels('Rentals (millions)\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3);
# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.23.a Facet Grid of trip distances on trip type.png', dpi=300, bbox_inches='tight')
The distribution of distance_miles values is right-skewed and needs closer interpretation. This can be achieved by limiting the plot to a threshold value.
Let us plot a wider variety of graphs to observe the distribution of the data and uncover hidden insights.
# Figure 3.2.23.b - trip distance per trip type shown three ways side by side:
# violin plot, box plot and strip plot.
plt.figure(figsize=[14, 4])
sb.set_style('darkgrid')
sb.set_palette(palette="GnBu_d", n_colors=5, desat=None)
base_color = sb.color_palette()[0]

# one entry per panel, left to right:
# (seaborn plotting function, plot-specific options, panel title, y label)
panel_specs = [
    (sb.violinplot, {'inner': 'quartile'}, 'Trip distances - Violin plot\n', 'Distance (miles)\n'),
    (sb.boxplot, {}, 'Trip distances - Box plot\n', ''),
    (sb.stripplot, {'alpha': 0.5}, 'Trip distances - Strip plot\n', ''),
]
for panel_no, (plotter, options, panel_title, y_caption) in enumerate(panel_specs, start=1):
    plt.subplot(1, 3, panel_no)
    plotter(data=bikeshare, x='trip_type', y='distance_miles', color=base_color, **options)
    plt.title(panel_title, weight='bold', fontsize=16)
    plt.xlabel('\nTrip type', fontsize=14)
    plt.ylabel(y_caption, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

plt.subplots_adjust(wspace=0.3, hspace=0.3);
# bbox_inches='tight' makes savefig grow the saved area so that all x and y labels are included
plt.savefig('plots/3.2.23.b Distribution of Trip type distances on various plots.png', dpi=300, bbox_inches='tight')
Calculate the average trip distance and the most frequent trip distance for each trip type.
# Figure 3.2.23.c - average (left) and most frequent (right) trip distance per trip type.
plt.figure(figsize = [12, 5])
sb.set_style('darkgrid')
sb.set_palette(palette = "GnBu_d", n_colors = 2, desat = None)
base_color = sb.color_palette()[0]
# left plot: bar chart - Avg trip distance
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.barplot(data = bikeshare, x = "trip_type", y = "distance_miles", hue = 'trip_type')
plt.title('Avg. Trip distance\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
# the plotted variable is distance_miles, so the unit is miles
plt.ylabel('Avg. distance (miles)\n', fontsize = 14)
plt.xlabel('\nTrip type', fontsize = 14)
plt.xticks(fontsize = 12)
plt.yticks(fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha = 1,
           borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5, ncol = 1,
           title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5)
# add annotations: print the bar value just above every bar
# -------------------------------------------------------
for p in ax1.patches:
    height = p.get_height()
    # hue repeats the x variable, so seaborn versions that dodge by hue leave
    # empty slots with NaN height; those have nothing to label
    if not np.isfinite(height):
        continue
    ax1.annotate(format(height, '.1f'), (p.get_x() + p.get_width() / 2., height),
                 ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points', fontsize = 12)
# -------------------------------------------------------
# right plot: bar chart - most frequent trip distance
sb.set_style('white')
plt.subplot(1, 2, 2)
# mode() may return several values; [0] takes the first one
oneway_mode = bikeshare.query(' trip_type == "One Way" ').distance_miles.mode()[0]
roundtrip_mode = bikeshare.query(' trip_type == "Round Trip" ').distance_miles.mode()[0]
# heights must follow the sorted label order (One Way, Round Trip)
heights = [oneway_mode, roundtrip_mode]
labels = bikeshare.trip_type.sort_values(ascending=True).unique()
g = sb.barplot(x = labels, y = heights, hue = labels)
plt.title('Most frequent trip distance\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.xlabel('\nTrip type', fontsize = 14)
plt.ylabel('Distance (miles)\n', fontsize = 14)
plt.xticks(fontsize = 12)
# reuse the left plot's y range so both panels are directly comparable
plt.ylim(ax1.get_ylim())
plt.yticks(fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha = 1,
           borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5, ncol = 1,
           title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5)
# add annotations: print the bar value just above every bar
# -------------------------------------------------------
for p in g.patches:
    height = p.get_height()
    # skip the NaN-height placeholders of empty hue slots (a real 0.0 bar is kept)
    if not np.isfinite(height):
        continue
    g.annotate(format(height, '.1f'), (p.get_x() + p.get_width() / 2., height),
               ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points', fontsize = 12)
# -------------------------------------------------------
plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of all trip distances based on trip type\n', fontsize = 16, weight = 'bold');
# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.23.c Assessment of all trip distances based on trip type.png', dpi=300, bbox_inches='tight')
Dataset limited by 3 miles:
# percentage of all rentals whose trip distance is at most 3 miles
rows_within_3_miles = bikeshare.query(' distance_miles <= 3 ').shape[0]
np.round((rows_within_3_miles/bikeshare.shape[0])*100, 2)
The calculations are influenced by the presence of outliers. Calculate the descriptive statistics by limiting the dataset to entries of at most 3 miles, which constitute 99% of the distance distribution.
# keep only the rentals with a trip distance of at most 3 miles
distance_lim_3 = bikeshare[bikeshare['distance_miles'] <= 3]
# Figure 3.2.23.d - histogram of trip distances up to 3 miles (0.1-mile bins),
# one facet per trip type.
from matplotlib.ticker import FuncFormatter

sb.set_style('white')
sb.set_palette(palette = "GnBu_d", n_colors = 5, desat = None)
base_color = sb.color_palette()[0]
g = sb.FacetGrid(data = distance_lim_3, col = 'trip_type', col_wrap = 2, height = 4, aspect = 1, xlim=(0,3))
g.map(plt.hist, "distance_miles", color = base_color, bins = np.arange(0, bikeshare.distance_miles.max()+0.1, 0.1))
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Distribution of trip distances based on trip type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
# Show the y ticks (rental counts) as multiples of a thousand.
# A formatter works on the numeric tick value, so it does not depend on the
# tick label text having been rendered already, nor on how matplotlib chose to
# write it, and it is applied per axis instead of concatenating the labels of
# all facets into one list.
thousands_formatter = FuncFormatter(lambda count, _pos: '{:0.0f} K'.format(count/1000))
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(thousands_formatter)
    ax.tick_params(axis = 'both', labelsize = 12)
# the histogram variable is the trip distance, so the x axis is in miles
g.set_xlabels('\nDistance (miles)', size = 14)
g.set_ylabels('Rentals (thousands)\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3);
# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.23.d Facet Grid of trip distances under 3 miles on trip type.png', dpi=300, bbox_inches='tight')
# Figure 3.2.23.e - trip distance (up to 3 miles) per trip type shown three ways
# side by side: violin plot, box plot and strip plot.
plt.figure(figsize=[14, 4])
sb.set_style('darkgrid')
sb.set_palette(palette="GnBu_d", n_colors=5, desat=None)
base_color = sb.color_palette()[0]

# one entry per panel, left to right:
# (seaborn plotting function, plot-specific options, panel title, y label)
panel_specs = [
    (sb.violinplot, {'inner': 'quartile'}, 'Trip distances - Violin plot\n', 'Distance (miles)\n'),
    (sb.boxplot, {}, 'Trip distances - Box plot\n', ''),
    (sb.stripplot, {'alpha': 0.005}, 'Trip distances - Strip plot\n', ''),
]
for panel_no, (plotter, options, panel_title, y_caption) in enumerate(panel_specs, start=1):
    plt.subplot(1, 3, panel_no)
    plotter(data=distance_lim_3, x='trip_type', y='distance_miles', color=base_color, **options)
    # clamp every panel to the 0-3 mile window of the filtered data
    plt.ylim(0, 3)
    plt.title(panel_title, weight='bold', fontsize=16)
    plt.xlabel('\nTrip type', fontsize=14)
    plt.ylabel(y_caption, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

plt.subplots_adjust(wspace=0.3, hspace=0.3);
# bbox_inches='tight' makes savefig grow the saved area so that all x and y labels are included
plt.savefig('plots/3.2.23.e Distribution of Trip type distances under 3 miles on various plots.png', dpi=300, bbox_inches='tight')
Calculate the average trip distance and the most frequent trip distance for each trip type.
# Figure 3.2.23.f - average (left) and most frequent (right) trip distance per
# trip type, computed on the rentals kept in distance_lim_3.
plt.figure(figsize = [12, 5])
sb.set_style('darkgrid')
sb.set_palette(palette = "GnBu_d", n_colors = 2, desat = None)
base_color = sb.color_palette()[0]
# left plot: bar chart - Avg trip distance
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.barplot(data = distance_lim_3, x = "trip_type", y = "distance_miles", hue = 'trip_type')
plt.title('Avg. Trip distance\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
# the plotted variable is distance_miles, so the unit is miles
plt.ylabel('Avg. distance (miles)\n', fontsize = 14)
plt.xlabel('\nTrip type', fontsize = 14)
plt.xticks(fontsize = 12)
plt.yticks(fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);
# swap the automatic legend for an empty, frameless one; the right plot carries the legend
plt.legend('', frameon=False, fancybox=False)
# add annotations: print the bar value just above every bar
# -------------------------------------------------------
for p in ax1.patches:
    height = p.get_height()
    # hue repeats the x variable, so seaborn versions that dodge by hue leave
    # empty slots with NaN height; those have nothing to label
    if not np.isfinite(height):
        continue
    ax1.annotate(format(height, '.1f'), (p.get_x() + p.get_width() / 2., height),
                 ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points', fontsize = 12)
# -------------------------------------------------------
# right plot: bar chart - most frequent trip distance
sb.set_style('white')
plt.subplot(1, 2, 2)
# mode() may return several values; [0] takes the first one
oneway_mode = distance_lim_3.query(' trip_type == "One Way" ').distance_miles.mode()[0]
roundtrip_mode = distance_lim_3.query(' trip_type == "Round Trip" ').distance_miles.mode()[0]
# heights must follow the sorted label order (One Way, Round Trip)
heights = [oneway_mode, roundtrip_mode]
labels = distance_lim_3.trip_type.sort_values(ascending=True).unique()
g = sb.barplot(x = labels, y = heights, hue = labels)
plt.title('Most frequent trip distance\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.xlabel('\nTrip type', fontsize = 14)
plt.ylabel('Distance (miles)\n', fontsize = 14)
plt.xticks(fontsize = 12)
# reuse the left plot's y range so both panels are directly comparable
plt.ylim(ax1.get_ylim())
plt.yticks(fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha = 1,
           borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5, ncol = 1,
           title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5)
# add annotations: print the bar value just above every bar
# -------------------------------------------------------
for p in g.patches:
    height = p.get_height()
    # skip the NaN-height placeholders of empty hue slots (a real 0.0 bar is kept)
    if not np.isfinite(height):
        continue
    g.annotate(format(height, '.1f'), (p.get_x() + p.get_width() / 2., height),
               ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points', fontsize = 12)
# -------------------------------------------------------
plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of trip distances under 3 miles based on trip type\n', fontsize = 16, weight = 'bold');
# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.23.f Assessment of trip distances under 3 miles based on trip type.png', dpi=300, bbox_inches='tight')
distance_miles and bike_type columns:¶
Columns: distance_miles, bike_type
Data type: (Numerical, continuous) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Bar chart
# Observe the distribution of distances through descriptive statistics
# summary statistics of the trip distance column (shown as the cell output)
bikeshare['distance_miles'].describe()
Use the above descriptive statistics to set the bin size for the upcoming plots.
# Figure 3.2.24.a - histogram of trip distances (1-mile bins), one facet per bike type.
from matplotlib.ticker import FuncFormatter

sb.set_style('white')
sb.set_palette(palette = "GnBu_d", n_colors = 5, desat = None)
base_color = sb.color_palette()[3]
g = sb.FacetGrid(data = bikeshare, col = 'bike_type', col_wrap = 2, height = 4, aspect = 1 )
g.map(plt.hist, "distance_miles", color = base_color, bins = np.arange(0, bikeshare.distance_miles.max()+1, 1))
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Distribution of all trip distances based on bike type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
# Show the y ticks (rental counts) as multiples of a thousand.
# A formatter works on the numeric tick value, so it does not depend on the
# tick label text having been rendered already, nor on how matplotlib chose to
# write it, and it is applied per axis instead of concatenating the labels of
# all facets into one list.
thousands_formatter = FuncFormatter(lambda count, _pos: '{:0.0f} K'.format(count/1000))
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(thousands_formatter)
    ax.tick_params(axis = 'both', labelsize = 12)
# the histogram variable is the trip distance, so the x axis is in miles
g.set_xlabels('\nDistance (miles)', size = 14)
g.set_ylabels('Rentals (thousands)\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3);
# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.24.a Facet Grid of trip distances on bike type.png', dpi=300, bbox_inches='tight')
The distribution of distance_miles values is right-skewed and needs closer interpretation. This can be achieved by limiting the plot to a threshold value.
Let us plot a wider variety of graphs to observe the distribution of the data and uncover hidden insights.
# Figure 3.2.24.b - trip distance per bike type shown three ways side by side:
# violin plot, box plot and strip plot.
plt.figure(figsize=[16, 4])
sb.set_style('darkgrid')
sb.set_palette(palette="GnBu_d", n_colors=5, desat=None)
base_color = sb.color_palette()[3]

# one entry per panel, left to right:
# (seaborn plotting function, plot-specific options, panel title, y label)
panel_specs = [
    (sb.violinplot, {'inner': 'quartile'}, 'Trip distances - Violin plot\n', 'Distance (miles)\n'),
    (sb.boxplot, {}, 'Trip distances - Box plot\n', ''),
    (sb.stripplot, {'alpha': 0.5}, 'Trip distances - Strip plot\n', ''),
]
for panel_no, (plotter, options, panel_title, y_caption) in enumerate(panel_specs, start=1):
    plt.subplot(1, 3, panel_no)
    plotter(data=bikeshare, x='bike_type', y='distance_miles', color=base_color, **options)
    plt.title(panel_title, weight='bold', fontsize=16)
    plt.xlabel('\nBike type', fontsize=14)
    plt.ylabel(y_caption, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

plt.subplots_adjust(wspace=0.3, hspace=0.3);
# bbox_inches='tight' makes savefig grow the saved area so that all x and y labels are included
plt.savefig('plots/3.2.24.b Distribution of Bike type distances on various plots.png', dpi=300, bbox_inches='tight')
# Figure 3.2.24.c - average (left) and most frequent (right) trip distance per bike type.
plt.figure(figsize = [12, 5])
flatui = ['#ff91e2', '#91ffda', '#60acfc', '#bd91ff']
sb.set_palette(flatui, n_colors=4, desat=0.8)
# left plot: bar chart - Avg trip distance
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.barplot(data = bikeshare, x = "bike_type", y = "distance_miles", hue = 'bike_type', dodge=False)
plt.title('Avg. Trip distance\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Avg. distance (miles)\n', fontsize = 14)
plt.xlabel('\nBike type', fontsize = 14)
plt.xticks(fontsize = 12)
plt.yticks(fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);
# swap the automatic legend for an empty, frameless one; the right plot carries the legend
plt.legend('', frameon=False, fancybox=False)
# add annotations: print the bar value just above every bar
# -------------------------------------------------------
for p in ax1.patches:
    height = p.get_height()
    # hue repeats the x variable, so each hue level can add NaN-height
    # placeholder bars at the other x positions; those have nothing to label
    if not np.isfinite(height):
        continue
    ax1.annotate(format(height, '.1f'), (p.get_x() + p.get_width() / 2., height),
                 ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points', fontsize = 12)
# -------------------------------------------------------
# right plot: Bar chart - most frequent trip distance
sb.set_style('white')
plt.subplot(1, 2, 2)
# mode() may return several values; [0] takes the first one
unknown_mode = bikeshare.query(' bike_type == "unknown" ').distance_miles.mode()[0]
standard_mode = bikeshare.query(' bike_type == "Standard" ').distance_miles.mode()[0]
electric_mode = bikeshare.query(' bike_type == "Electric" ').distance_miles.mode()[0]
smart_mode = bikeshare.query(' bike_type == "Smart" ').distance_miles.mode()[0]
# NOTE(review): heights are paired with labels by position, so this order must
# match the sorted bike_type values - presumably unknown, Standard, Electric,
# Smart via the ordered categorical dtype; confirm against the data loading cell
heights = [unknown_mode, standard_mode, electric_mode, smart_mode]
labels = bikeshare.bike_type.sort_values(ascending=True).unique()
g = sb.barplot(x = labels, y = heights, hue = labels, dodge=False)
plt.title('Most frequent trip distance\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
# the bars are bike types (not trip types)
plt.xlabel('\nBike type', fontsize = 14)
plt.ylabel('Distance (miles)\n', fontsize = 14)
plt.xticks(fontsize = 12)
# reuse the left plot's y range so both panels are directly comparable
plt.ylim(ax1.get_ylim())
plt.yticks(fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha = 1,
           borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5, ncol = 1,
           title='Bike type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5)
# add annotations: print the bar value just above every bar
# -------------------------------------------------------
for p in g.patches:
    height = p.get_height()
    # skip the NaN-height placeholder bars (a real 0.0 bar is kept)
    if not np.isfinite(height):
        continue
    g.annotate(format(height, '.1f'), (p.get_x() + p.get_width() / 2., height),
               ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points', fontsize = 12)
# -------------------------------------------------------
plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of all trip distances based on bike type\n', fontsize = 16, weight = 'bold');
# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
# NOTE(review): the file name says "trip type" although the figure is grouped by
# bike type; it is left unchanged here because other documents may reference it
plt.savefig('plots/3.2.24.c Average trip distances based on trip type.png', dpi=300, bbox_inches='tight')
The Round Trip entries have a distance/displacement equal to Zero and are clustered together, unlike One Way trips, which are distributed between 1-25 miles. Hence remove the entries with distance value 0 and re-evaluate the descriptive statistics, to calculate the correct mode values.
# index labels of the rentals that recorded a distance of exactly 0 miles
drop_index = bikeshare.index[bikeshare['distance_miles'] == 0]
# working copy of the data without those zero-distance rentals
temp_df = bikeshare.drop(index=drop_index)
# Figure 3.2.24.d - average (left) and most frequent (right) trip distance per
# bike type, computed on temp_df (rentals with a distance of 0 removed).
plt.figure(figsize = [12, 5])
flatui = ['#ff91e2', '#91ffda', '#60acfc', '#bd91ff']
sb.set_palette(flatui, n_colors=4, desat=0.8)
# left plot: bar chart - Avg trip distance
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.barplot(data = temp_df, x = "bike_type", y = "distance_miles", hue = 'bike_type', dodge=False)
plt.title('Avg. Trip distance\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Avg. distance (miles)\n', fontsize = 14)
plt.xlabel('\nBike type', fontsize = 14)
plt.xticks(fontsize = 12)
plt.yticks(fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);
# swap the automatic legend for an empty, frameless one; the right plot carries the legend
plt.legend('', frameon=False, fancybox=False)
# add annotations: print the bar value just above every bar
# -------------------------------------------------------
for p in ax1.patches:
    height = p.get_height()
    # hue repeats the x variable, so each hue level can add NaN-height
    # placeholder bars at the other x positions; those have nothing to label
    if not np.isfinite(height):
        continue
    ax1.annotate(format(height, '.1f'), (p.get_x() + p.get_width() / 2., height),
                 ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points', fontsize = 12)
# -------------------------------------------------------
# right plot: Bar chart - most frequent trip distance
sb.set_style('white')
plt.subplot(1, 2, 2)
# mode() may return several values; [0] takes the first one
unknown_mode = temp_df.query(' bike_type == "unknown" ').distance_miles.mode()[0]
standard_mode = temp_df.query(' bike_type == "Standard" ').distance_miles.mode()[0]
electric_mode = temp_df.query(' bike_type == "Electric" ').distance_miles.mode()[0]
smart_mode = temp_df.query(' bike_type == "Smart" ').distance_miles.mode()[0]
# NOTE(review): heights are paired with labels by position, so this order must
# match the sorted bike_type values - presumably unknown, Standard, Electric,
# Smart via the ordered categorical dtype; confirm against the data loading cell
heights = [unknown_mode, standard_mode, electric_mode, smart_mode]
labels = temp_df.bike_type.sort_values(ascending=True).unique()
g = sb.barplot(x = labels, y = heights, hue = labels, dodge=False)
plt.title('Most frequent trip distance\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
# the bars are bike types (not trip types)
plt.xlabel('\nBike type', fontsize = 14)
plt.ylabel('Distance (miles)\n', fontsize = 14)
plt.xticks(fontsize = 12)
# reuse the left plot's y range so both panels are directly comparable
plt.ylim(ax1.get_ylim())
plt.yticks(fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha = 1,
           borderpad=1, borderaxespad=1, loc = 'upper left', labelspacing=0.5, ncol = 1,
           title='Bike type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5)
# add annotations: print the bar value just above every bar
# -------------------------------------------------------
for p in g.patches:
    height = p.get_height()
    # skip the NaN-height placeholder bars
    if not np.isfinite(height):
        continue
    g.annotate(format(height, '.1f'), (p.get_x() + p.get_width() / 2., height),
               ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points', fontsize = 12)
# -------------------------------------------------------
plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of all trip distances based on bike type\n', fontsize = 16, weight = 'bold');
# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.24.d Assessment of all trip distances based on bike type.png', dpi=300, bbox_inches='tight')
Dataset limited by 3 miles:
# percentage of all rentals whose trip distance is at most 3 miles
rows_within_3_miles = bikeshare.query(' distance_miles <= 3 ').shape[0]
np.round((rows_within_3_miles/bikeshare.shape[0])*100, 2)
The calculations are influenced by the presence of outliers. Calculate the descriptive statistics by limiting the dataset to entries of at most 3 miles, which constitute 99% of the distance distribution. Also remove the entries with a distance of zero, as they are concentrated in the Round Trip data and distort the actual statistics.
# percentage of all rentals with a trip distance above 0 and at most 3 miles
rows_in_window = bikeshare.query(' distance_miles <= 3 and distance_miles > 0 ').shape[0]
np.round((rows_in_window/bikeshare.shape[0])*100, 2)
# keep only the rentals inside that window (0 < distance <= 3 miles)
in_window = (bikeshare['distance_miles'] <= 3) & (bikeshare['distance_miles'] > 0)
distance_lim_3 = bikeshare[in_window]
# Figure 3.2.24.e - histogram of trip distances within (0, 3] miles (0.1-mile bins),
# one facet per bike type.
from matplotlib.ticker import FuncFormatter

sb.set_style('white')
sb.set_palette(palette = "GnBu_d", n_colors = 5, desat = None)
base_color = sb.color_palette()[3]
g = sb.FacetGrid(data = distance_lim_3, col = 'bike_type', col_wrap = 2, height = 4, aspect = 1, xlim=(0,3))
g.map(plt.hist, "distance_miles", color = base_color, bins = np.arange(0, bikeshare.distance_miles.max()+0.1, 0.1))
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Distribution of trip distances under 3 miles based on bike type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
# Show the y ticks (rental counts) as multiples of a thousand.
# A formatter works on the numeric tick value, so it does not depend on the
# tick label text having been rendered already, nor on how matplotlib chose to
# write it, and it is applied per axis instead of concatenating the labels of
# all facets into one list.
thousands_formatter = FuncFormatter(lambda count, _pos: '{:0.0f} K'.format(count/1000))
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(thousands_formatter)
    ax.tick_params(axis = 'both', labelsize = 12)
g.set_xlabels('\nDistance (miles)', size = 14)
g.set_ylabels('Rentals (thousands)\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3);
# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.24.e Facet Grid of trip distances under 3 miles on bike type.png', dpi=300, bbox_inches='tight')
Let us plot a wider variety of graphs to observe the distribution of the data and uncover hidden insights.
# Figure 3.2.24.f - trip distance (filtered set distance_lim_3) per bike type shown
# three ways side by side: violin plot, box plot and strip plot.
plt.figure(figsize=[16, 4])
sb.set_style('darkgrid')
sb.set_palette(palette="GnBu_d", n_colors=5, desat=None)
base_color = sb.color_palette()[3]

# one entry per panel, left to right:
# (seaborn plotting function, plot-specific options, panel title, y label)
panel_specs = [
    (sb.violinplot, {'inner': 'quartile'}, 'Trip distances - Violin plot\n', 'Distance (miles)\n'),
    (sb.boxplot, {}, 'Trip distances - Box plot\n', ''),
    (sb.stripplot, {'alpha': 0.005}, 'Trip distances - Strip plot\n', ''),
]
for panel_no, (plotter, options, panel_title, y_caption) in enumerate(panel_specs, start=1):
    plt.subplot(1, 3, panel_no)
    plotter(data=distance_lim_3, x='bike_type', y='distance_miles', color=base_color, **options)
    plt.title(panel_title, weight='bold', fontsize=16)
    plt.xlabel('\nBike type', fontsize=14)
    plt.ylabel(y_caption, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

plt.subplots_adjust(wspace=0.3, hspace=0.3);
# bbox_inches='tight' makes savefig grow the saved area so that all x and y labels are included
plt.savefig('plots/3.2.24.f Distribution of Bike type distances under 3 miles on various plots.png', dpi=300, bbox_inches='tight')
Calculate the average trip distance and the most frequent trip distance for each bike type.
# Figure 3.2.24.g - average (left) and most frequent (right) trip distance per
# bike type, computed on the rentals kept in distance_lim_3.
plt.figure(figsize = [12, 5])
flatui = ['#ff91e2', '#91ffda', '#60acfc', '#bd91ff']
sb.set_palette(flatui, n_colors=4, desat=0.8)
# left plot: bar chart - Avg trip distance
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.barplot(data = distance_lim_3, x = "bike_type", y = "distance_miles", hue = 'bike_type', dodge=False)
plt.title('Avg. Trip distance\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Avg. distance (miles)\n', fontsize = 14)
plt.xlabel('\nBike type', fontsize = 14)
plt.xticks(fontsize = 12)
plt.yticks(fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);
# swap the automatic legend for an empty, frameless one; the right plot carries the legend
plt.legend('', frameon=False, fancybox=False)
# add annotations: print the bar value just above every bar
# -------------------------------------------------------
for p in ax1.patches:
    height = p.get_height()
    # hue repeats the x variable, so each hue level can add NaN-height
    # placeholder bars at the other x positions; those have nothing to label
    if not np.isfinite(height):
        continue
    ax1.annotate(format(height, '.1f'), (p.get_x() + p.get_width() / 2., height),
                 ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points', fontsize = 12)
# -------------------------------------------------------
# right plot: Bar chart - most frequent trip distance
sb.set_style('white')
plt.subplot(1, 2, 2)
# mode() may return several values; [0] takes the first one
unknown_mode = distance_lim_3.query(' bike_type == "unknown" ').distance_miles.mode()[0]
standard_mode = distance_lim_3.query(' bike_type == "Standard" ').distance_miles.mode()[0]
electric_mode = distance_lim_3.query(' bike_type == "Electric" ').distance_miles.mode()[0]
smart_mode = distance_lim_3.query(' bike_type == "Smart" ').distance_miles.mode()[0]
# NOTE(review): heights are paired with labels by position, so this order must
# match the sorted bike_type values - presumably unknown, Standard, Electric,
# Smart via the ordered categorical dtype; confirm against the data loading cell
heights = [unknown_mode, standard_mode, electric_mode, smart_mode]
labels = distance_lim_3.bike_type.sort_values(ascending=True).unique()
g = sb.barplot(x = labels, y = heights, hue = labels, dodge=False)
plt.title('Most frequent trip distance\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
# the bars are bike types (not trip types)
plt.xlabel('\nBike type', fontsize = 14)
plt.ylabel('Distance (miles)\n', fontsize = 14)
plt.xticks(fontsize = 12)
# reuse the left plot's y range so both panels are directly comparable
plt.ylim(ax1.get_ylim())
plt.yticks(fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha = 1,
           borderpad=1, borderaxespad=1, loc = 'upper left', labelspacing=0.5, ncol = 1,
           title='Bike type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5)
# add annotations: print the bar value just above every bar
# -------------------------------------------------------
for p in g.patches:
    height = p.get_height()
    # skip the NaN-height placeholder bars
    if not np.isfinite(height):
        continue
    g.annotate(format(height, '.1f'), (p.get_x() + p.get_width() / 2., height),
               ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points', fontsize = 12)
# -------------------------------------------------------
plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of trip distances under 3 miles based on bike type\n', fontsize = 16, weight = 'bold');
# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.24.g Assessment of trip distances under 3 miles based on bike type.png', dpi=300, bbox_inches='tight')
distance_miles and pass_type columns:¶
Columns: distance_miles, pass_type
Data type: (Numerical, continuous) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Bar chart
# Observe the distribution of distances through descriptive statistics
# summary statistics of the trip distances, shown as the cell output
bikeshare['distance_miles'].describe()
Use the above descriptive statistics to set the bin size for the upcoming plots.
# Histogram of trip distances, one facet per pass type (all trips).
from matplotlib.ticker import FuncFormatter

sb.set_style('white')
sb.set_palette(palette="GnBu_d", n_colors=5, desat=None)
base_color = sb.color_palette()[4]
# 1-mile wide bins spanning the full range of recorded distances
g = sb.FacetGrid(data=bikeshare, col='pass_type', col_wrap=3, height=4, aspect=1)
g.map(plt.hist, "distance_miles", color=base_color,
      bins=np.arange(0, bikeshare.distance_miles.max() + 1, 1))
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Distribution of trip distances based on pass type\n', fontsize=16, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')
# show the y ticks (rental counts) in thousands.
# A formatter is attached to each facet instead of parsing the existing label text:
# that text may still be empty before the figure is drawn, and pooling the labels
# of every facet into one list gives more labels than any single axis has ticks.
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(
        FuncFormatter(lambda count, _pos: '{:0.0f} K'.format(count / 1000)))
    ax.tick_params(axis='both', labelsize=12)
# the histogram's x axis is distance_miles, so label it as a distance
g.set_xlabels('\nDistance (miles)', size=14)
g.set_ylabels('Rentals (thousands)\n', size=14)
plt.subplots_adjust(wspace=0.2, hspace=0.3);
# bbox_inches='tight' grows the saved area so that all x and y labels are included
plt.savefig('plots/3.2.25.a Facet Grid of trip distances on pass type.png', dpi=300, bbox_inches='tight')
The distribution of distance_miles values is right-skewed and needs closer inspection. This can be achieved by limiting the plot to a threshold value.
Let us plot a wider variety of graphs to observe the distribution of the data and uncover hidden insights.
# Violin, box and strip plot of trip distance per pass type, side by side (all trips).
plt.figure(figsize=[16, 4])
sb.set_style('darkgrid')
sb.set_palette(palette="GnBu_d", n_colors=5, desat=None)
base_color = sb.color_palette()[4]

# (plot function, plot-specific keywords, panel title, y-axis label) from left to right;
# only the left-most panel carries a y-axis label
panels = [
    (sb.violinplot, {'inner': 'quartile'}, 'Trip distances - Violin plot\n', 'Distance (miles)\n'),
    (sb.boxplot, {}, 'Trip distances - Box plot\n', ''),
    (sb.stripplot, {'alpha': 0.5}, 'Trip distances - Strip plot\n', ''),
]
for position, (draw, extra, heading, y_caption) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    draw(data=bikeshare, x='pass_type', y='distance_miles', color=base_color, **extra)
    plt.title(heading, weight='bold', fontsize=16)
    plt.xlabel('\nPass type', fontsize=14)
    plt.ylabel(y_caption, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

plt.subplots_adjust(wspace=0.3, hspace=0.3);
# bbox_inches='tight' grows the saved area so that all x and y labels are included
plt.savefig('plots/3.2.25.b Distribution of Pass type durations on various plots.png', dpi=300, bbox_inches='tight')
Calculate the average trip distance and the most frequent trip distance for each pass type.
# Average vs. most frequent trip distance per pass type, over all trips.
plt.figure(figsize=[12, 5])
flatui = ["#26bda7", "#9b59b6", "#3498db", "#e74c3c", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.8)

# left plot: bar chart - average trip distance (barplot aggregates with the mean by default)
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.barplot(data=bikeshare, x="pass_type", y="distance_miles", hue='pass_type', alpha=0.8, dodge=False)
plt.title('Avg. Trip distance\n\n', weight='bold', fontsize=16, color='dimgrey')
plt.ylabel('Avg. distance (miles)\n', fontsize=14)
plt.xlabel('\nPass type', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);
# an empty, frameless legend suppresses the redundant hue legend on this panel
plt.legend('', frameon=False, fancybox=False)
# add annotations: print each bar's height 10 points above its top
# -------------------------------------------------------
for p in ax1.patches:
    ax1.annotate(format(p.get_height(), '.1f'), (p.get_x() + p.get_width() / 2., p.get_height()),
                 ha='center', va='center', xytext=(0, 10), textcoords='offset points', fontsize=12)
# -------------------------------------------------------

# right plot: bar chart - most frequent (modal) trip distance
sb.set_style('white')
plt.subplot(1, 2, 2)
labels = bikeshare.pass_type.sort_values(ascending=True).unique()
# compute every mode from its own label so bars and tick labels cannot get out of step,
# whatever order `labels` comes back in
heights = [bikeshare.loc[bikeshare.pass_type == label, 'distance_miles'].mode()[0]
           for label in labels]
g = sb.barplot(x=labels, y=heights, hue=labels, alpha=0.8, dodge=False)
plt.title('Most frequent trip distance\n\n', weight='bold', fontsize=16, color='dimgrey')
plt.xlabel('\nPass type', fontsize=14)
plt.ylabel('Distance (miles)\n', fontsize=14)
plt.xticks(fontsize=12)
# reuse the left panel's y range so both panels are directly comparable
plt.ylim(ax1.get_ylim())
plt.yticks(fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha=1,
           borderpad=1, borderaxespad=1, loc='upper right', labelspacing=0.5, ncol=1,
           title='Pass type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1, 1.1))
# add annotations
# -------------------------------------------------------
for p in g.patches:
    g.annotate(format(p.get_height(), '.1f'), (p.get_x() + p.get_width() / 2., p.get_height()),
               ha='center', va='center', xytext=(0, 10), textcoords='offset points', fontsize=12)
# -------------------------------------------------------
plt.subplots_adjust(wspace=0.3, hspace=0.3)
# leave head-room for the figure-level title
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of all trip distances based on pass type\n', fontsize=16, weight='bold');
# bbox_inches='tight' grows the saved area so that all x and y labels are included
plt.savefig('plots/3.2.25.c Average trip distances based on pass type.png', dpi=300, bbox_inches='tight')
The Round Trip entries have a distance/displacement equal to zero and are clustered together, unlike One Way trips, which are distributed between 1-25 miles. Hence remove the entries with a distance value of 0 and re-evaluate the descriptive statistics to calculate the correct mode values.
# Average vs. most frequent trip distance per pass type, after removing
# the trips whose distance is exactly 0.
drop_index = bikeshare.query(' distance_miles == 0 ').index
temp_df = bikeshare.drop(drop_index)

plt.figure(figsize=[12, 5])
flatui = ["#26bda7", "#9b59b6", "#3498db", "#e74c3c", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.8)

# left plot: bar chart - average trip distance (barplot aggregates with the mean by default)
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.barplot(data=temp_df, x="pass_type", y="distance_miles", hue='pass_type',
                 alpha=0.8, dodge=False)
plt.title('Avg. Trip distance\n\n', weight='bold', fontsize=16, color='dimgrey')
plt.ylabel('Avg. distance (miles)\n', fontsize=14)
plt.xlabel('\nPass type', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);
# an empty, frameless legend suppresses the redundant hue legend on this panel
plt.legend('', frameon=False, fancybox=False)
# add annotations: print each bar's height 10 points above its top
# -------------------------------------------------------
for p in ax1.patches:
    ax1.annotate(format(p.get_height(), '.1f'), (p.get_x() + p.get_width() / 2., p.get_height()),
                 ha='center', va='center', xytext=(0, 10), textcoords='offset points', fontsize=12)
# -------------------------------------------------------

# right plot: bar chart - most frequent (modal) trip distance
sb.set_style('white')
plt.subplot(1, 2, 2)
labels = temp_df.pass_type.sort_values(ascending=True).unique()
# compute every mode from its own label so bars and tick labels cannot get out of step,
# whatever order `labels` comes back in
heights = [temp_df.loc[temp_df.pass_type == label, 'distance_miles'].mode()[0]
           for label in labels]
ax2 = sb.barplot(x=labels, y=heights, hue=labels, alpha=0.8, dodge=False)
plt.title('Most frequent trip distance\n\n', weight='bold', fontsize=16, color='dimgrey')
plt.xlabel('\nPass type', fontsize=14)
plt.ylabel('Distance (miles)\n', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha=1,
           borderpad=1, borderaxespad=1, loc='upper right', labelspacing=0.5, ncol=1,
           title='Pass type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1, 1.1))
# add annotations
# -------------------------------------------------------
for p in ax2.patches:
    ax2.annotate(format(p.get_height(), '.1f'), (p.get_x() + p.get_width() / 2., p.get_height()),
                 ha='center', va='center', xytext=(0, 10), textcoords='offset points', fontsize=12)
# -------------------------------------------------------
# increase first plot's y axis limit if second plot's y axis limit is higher.
# get_ylim() returns (bottom, top); compare the tops explicitly, because comparing
# the tuples would be decided by the bottoms whenever those differ
if ax1.get_ylim()[1] < ax2.get_ylim()[1]:
    ax1.set_ylim(ax2.get_ylim())
plt.subplots_adjust(wspace=0.3, hspace=0.3)
# leave head-room for the figure-level title
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of all trip distances based on pass type\n', fontsize=16, weight='bold');
# bbox_inches='tight' grows the saved area so that all x and y labels are included
plt.savefig('plots/3.2.25.d Assessment of all trip distances based on pass type.png', dpi=300, bbox_inches='tight')
Dataset limited to 3 miles:
# percentage (two decimals) of all trips whose distance is at most 3 miles
trips_within_3_miles = len(bikeshare.query(' distance_miles <= 3 '))
np.round((trips_within_3_miles / len(bikeshare)) * 100, 2)
The calculations are influenced by the presence of outliers. Calculate the descriptive statistics by limiting the dataset to entries under 3 miles, which constitute 99% of the distance distribution. Also remove the entries with a distance of zero, as they are clusters of Round Trip data that distort the actual statistics.
# percentage (two decimals) of all trips longer than 0 miles and at most 3 miles
nonzero_trips_within_3_miles = len(bikeshare.query(' distance_miles <= 3 and distance_miles > 0 '))
np.round((nonzero_trips_within_3_miles / len(bikeshare)) * 100, 2)
# keep only the trips longer than 0 miles and no longer than 3 miles
is_short_nonzero_trip = (bikeshare.distance_miles <= 3) & (bikeshare.distance_miles > 0)
distance_lim_3 = bikeshare[is_short_nonzero_trip]
# Histogram of trip distances between 0 and 3 miles, one facet per pass type.
from matplotlib.ticker import FuncFormatter

sb.set_style('white')
sb.set_palette(palette="GnBu_d", n_colors=5, desat=None)
base_color = sb.color_palette()[4]
# 0.1-mile wide bins; the x axis is clipped to the 0-3 mile window
g = sb.FacetGrid(data=distance_lim_3, col='pass_type', col_wrap=3, height=4, aspect=1, xlim=(0, 3))
g.map(plt.hist, "distance_miles", color=base_color,
      bins=np.arange(0, bikeshare.distance_miles.max() + 0.1, 0.1))
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Distribution of trip distances under 3 miles based on pass type\n', fontsize=16, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')
# show the y ticks (rental counts) in thousands.
# A formatter is attached to each facet instead of parsing the existing label text:
# that text may still be empty before the figure is drawn, and pooling the labels
# of every facet into one list gives more labels than any single axis has ticks.
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(
        FuncFormatter(lambda count, _pos: '{:0.0f} K'.format(count / 1000)))
    ax.tick_params(axis='both', labelsize=12)
g.set_xlabels('\nDistance (miles)', size=14)
g.set_ylabels('Rentals (thousands)\n', size=14)
plt.subplots_adjust(wspace=0.2, hspace=0.3);
# bbox_inches='tight' grows the saved area so that all x and y labels are included
# NOTE(review): the file name says "bike type" although the grid is faceted by pass_type;
# left unchanged in case the image is referenced elsewhere - confirm and rename.
plt.savefig('plots/3.2.25.e Facet Grid of trip distances under 3 miles on bike type.png', dpi=300, bbox_inches='tight')
Let us plot a wider variety of graphs to observe the distribution of the data and uncover hidden insights.
# Violin, box and strip plot of trip distance per pass type, side by side,
# drawn from the distance_lim_3 subset.
plt.figure(figsize=[16, 4])
sb.set_style('darkgrid')
sb.set_palette(palette="GnBu_d", n_colors=5, desat=None)
base_color = sb.color_palette()[4]

# (plot function, plot-specific keywords, panel title, y-axis label) from left to right;
# only the left-most panel carries a y-axis label
panels = [
    (sb.violinplot, {'inner': 'quartile'}, 'Trip distances - Violin plot\n', 'Distance (miles)\n'),
    (sb.boxplot, {}, 'Trip distances - Box plot\n', ''),
    (sb.stripplot, {'alpha': 0.5}, 'Trip distances - Strip plot\n', ''),
]
for position, (draw, extra, heading, y_caption) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    draw(data=distance_lim_3, x='pass_type', y='distance_miles', color=base_color, **extra)
    plt.title(heading, weight='bold', fontsize=16)
    plt.xlabel('\nPass type', fontsize=14)
    plt.ylabel(y_caption, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

plt.subplots_adjust(wspace=0.3, hspace=0.3);
# bbox_inches='tight' grows the saved area so that all x and y labels are included
plt.savefig('plots/3.2.25.f Distribution of Pass type durations under 3 miles on various plots.png', dpi=300, bbox_inches='tight')
Calculate the average trip distance and the most frequent trip distance for each pass type.
# Average vs. most frequent trip distance per pass type, drawn from distance_lim_3.
plt.figure(figsize=[12, 5])
flatui = ["#26bda7", "#9b59b6", "#3498db", "#e74c3c", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.8)

# left plot: bar chart - average trip distance (barplot aggregates with the mean by default)
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.barplot(data=distance_lim_3, x="pass_type", y="distance_miles", hue='pass_type',
                 alpha=0.8, dodge=False)
plt.title('Avg. Trip distance\n\n', weight='bold', fontsize=16, color='dimgrey')
plt.ylabel('Avg. distance (miles)\n', fontsize=14)
plt.xlabel('\nPass type', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);
# an empty, frameless legend suppresses the redundant hue legend on this panel
plt.legend('', frameon=False, fancybox=False)
# add annotations: print each bar's height 10 points above its top
# -------------------------------------------------------
for p in ax1.patches:
    ax1.annotate(format(p.get_height(), '.1f'), (p.get_x() + p.get_width() / 2., p.get_height()),
                 ha='center', va='center', xytext=(0, 10), textcoords='offset points', fontsize=12)
# -------------------------------------------------------

# right plot: bar chart - most frequent (modal) trip distance
sb.set_style('white')
plt.subplot(1, 2, 2)
labels = distance_lim_3.pass_type.sort_values(ascending=True).unique()
# compute every mode from its own label so bars and tick labels cannot get out of step,
# whatever order `labels` comes back in
heights = [distance_lim_3.loc[distance_lim_3.pass_type == label, 'distance_miles'].mode()[0]
           for label in labels]
ax2 = sb.barplot(x=labels, y=heights, hue=labels, alpha=0.8, dodge=False)
plt.title('Most frequent trip distance\n\n', weight='bold', fontsize=16, color='dimgrey')
plt.xlabel('\nPass type', fontsize=14)
plt.ylabel('Distance (miles)\n', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha=1,
           borderpad=1, borderaxespad=1, loc='upper right', labelspacing=0.5, ncol=1,
           title='Pass type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1, 1.1))
# add annotations
# -------------------------------------------------------
for p in ax2.patches:
    ax2.annotate(format(p.get_height(), '.1f'), (p.get_x() + p.get_width() / 2., p.get_height()),
                 ha='center', va='center', xytext=(0, 10), textcoords='offset points', fontsize=12)
# -------------------------------------------------------
# increase first plot's y axis limit if second plot's y axis limit is higher.
# get_ylim() returns (bottom, top); compare the tops explicitly, because comparing
# the tuples would be decided by the bottoms whenever those differ
if ax1.get_ylim()[1] < ax2.get_ylim()[1]:
    ax1.set_ylim(ax2.get_ylim())
plt.subplots_adjust(wspace=0.3, hspace=0.3)
# leave head-room for the figure-level title
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of trip distances under 3 miles based on pass type\n', fontsize=16, weight='bold');
# bbox_inches='tight' grows the saved area so that all x and y labels are included
plt.savefig('plots/3.2.25.g Assessment of trip distances under 3 miles based on pass type.png', dpi=300, bbox_inches='tight')
distance_miles and fare_type columns:¶
Columns: distance_miles, fare_type
Data type: (Numerical, continuous) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Bar chart
# Observe the distribution of distances through descriptive statistics
# summary statistics of the trip distances, shown as the cell output
bikeshare['distance_miles'].describe()
Use the above descriptive statistics to set the bin size for the upcoming plots.
# Histogram of trip distances, one facet per fare type (all trips).
from matplotlib.ticker import FuncFormatter

sb.set_style('white')
flatui = ["#fff480"]
sb.set_palette(flatui, n_colors=1, desat=0.6)
base_color = sb.color_palette()[0]
# 1-mile wide bins spanning the full range of recorded distances
g = sb.FacetGrid(data=bikeshare, col='fare_type', col_wrap=2, height=4, aspect=1)
g.map(plt.hist, "distance_miles", color=base_color,
      bins=np.arange(0, bikeshare.distance_miles.max() + 1, 1))
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Distribution of all trip distances based on fare type\n', fontsize=16, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')
# show the y ticks (rental counts) in thousands.
# A formatter is attached to each facet instead of parsing the existing label text:
# that text may still be empty before the figure is drawn, and pooling the labels
# of every facet into one list gives more labels than any single axis has ticks.
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(
        FuncFormatter(lambda count, _pos: '{:0.0f} K'.format(count / 1000)))
    ax.tick_params(axis='both', labelsize=12)
# the histogram's x axis is distance_miles, so label it as a distance
g.set_xlabels('\nDistance (miles)', size=14)
g.set_ylabels('Rentals (thousands)\n', size=14)
plt.subplots_adjust(wspace=0.2, hspace=0.3);
# bbox_inches='tight' grows the saved area so that all x and y labels are included
plt.savefig('plots/3.2.26.a Facet Grid of trip distances on fare type.png', dpi=300, bbox_inches='tight')
The distribution of distance_miles values is right-skewed and needs closer inspection. This can be achieved by limiting the plot to a threshold value.
Let us plot a wider variety of graphs to observe the distribution of the data and uncover hidden insights.
# Violin, box and strip plot of trip distance per fare type, side by side (all trips).
# NOTE(review): unlike the sibling cells, this one does not call sb.set_style('darkgrid'),
# so it inherits whatever style is currently active - confirm this is intended.
plt.figure(figsize=[16, 4])
flatui = ["#fff480"]
sb.set_palette(flatui, n_colors=1, desat=0.8)
base_color = sb.color_palette()[0]

# (plot function, plot-specific keywords, panel title, y-axis label) from left to right;
# only the left-most panel carries a y-axis label
panels = [
    (sb.violinplot, {'inner': 'quartile'}, 'Trip distances - Violin plot\n', 'Distance (miles)\n'),
    (sb.boxplot, {}, 'Trip distances - Box plot\n', ''),
    (sb.stripplot, {'alpha': 0.5}, 'Trip distances - Strip plot\n', ''),
]
for position, (draw, extra, heading, y_caption) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    draw(data=bikeshare, x='fare_type', y='distance_miles', color=base_color, **extra)
    plt.title(heading, weight='bold', fontsize=16)
    plt.xlabel('\nFare type', fontsize=14)
    plt.ylabel(y_caption, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

plt.subplots_adjust(wspace=0.3, hspace=0.3);
# bbox_inches='tight' grows the saved area so that all x and y labels are included
plt.savefig('plots/3.2.26.b Distribution of Fare type durations on various plots.png', dpi=300, bbox_inches='tight')
Calculate the average trip distance and the most frequent trip distance for each fare type.
# Average vs. most frequent trip distance per fare type, over all trips.
plt.figure(figsize=[12, 5])
flatui = ["#fff480"]
sb.set_palette(flatui, n_colors=1, desat=0.8)
base_color = sb.color_palette()[0]

# left plot: bar chart - average trip distance (barplot aggregates with the mean by default)
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.barplot(data=bikeshare, x="fare_type", y="distance_miles", hue='fare_type', alpha=0.8, dodge=False)
plt.title('Avg. Trip distance\n\n', weight='bold', fontsize=16, color='dimgrey')
plt.ylabel('Avg. distance (miles)\n', fontsize=14)
plt.xlabel('\nFare type', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);
# an empty, frameless legend suppresses the redundant hue legend on this panel
plt.legend('', frameon=False, fancybox=False)
# add annotations: print each bar's height 10 points above its top
# -------------------------------------------------------
for p in ax1.patches:
    ax1.annotate(format(p.get_height(), '.1f'), (p.get_x() + p.get_width() / 2., p.get_height()),
                 ha='center', va='center', xytext=(0, 10), textcoords='offset points', fontsize=12)
# -------------------------------------------------------

# right plot: bar chart - most frequent (modal) trip distance
sb.set_style('white')
plt.subplot(1, 2, 2)
labels = bikeshare.fare_type.sort_values(ascending=True).unique()
# compute every mode from its own label so bars and tick labels cannot get out of step,
# whatever order `labels` comes back in
heights = [bikeshare.loc[bikeshare.fare_type == label, 'distance_miles'].mode()[0]
           for label in labels]
ax2 = sb.barplot(x=labels, y=heights, hue=labels, alpha=0.8, dodge=False)
plt.title('Most frequent trip distance\n\n', weight='bold', fontsize=16, color='dimgrey')
plt.xlabel('\nFare type', fontsize=14)
plt.ylabel('Distance (miles)\n', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);
# the legend lists fare types, so title it accordingly
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha=1,
           borderpad=1, borderaxespad=1, loc='upper right', labelspacing=0.5, ncol=1,
           title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1, 1.1))
# add annotations
# -------------------------------------------------------
for p in ax2.patches:
    ax2.annotate(format(p.get_height(), '.1f'), (p.get_x() + p.get_width() / 2., p.get_height()),
                 ha='center', va='center', xytext=(0, 10), textcoords='offset points', fontsize=12)
# -------------------------------------------------------
# adjust the two plots to have the same y axis limits (the taller one wins).
# get_ylim() returns (bottom, top); compare the tops explicitly, because comparing
# the tuples would be decided by the bottoms whenever those differ
if ax1.get_ylim()[1] < ax2.get_ylim()[1]:
    ax1.set_ylim(ax2.get_ylim())
else:
    ax2.set_ylim(ax1.get_ylim())
plt.subplots_adjust(wspace=0.3, hspace=0.3)
# leave head-room for the figure-level title
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of all trip distances based on fare type\n', fontsize=16, weight='bold');
# bbox_inches='tight' grows the saved area so that all x and y labels are included
plt.savefig('plots/3.2.26.c Average trip distances based on fare type.png', dpi=300, bbox_inches='tight')
The Round Trip entries have a distance/displacement equal to zero and are clustered together, unlike One Way trips, which are distributed between 1-25 miles. Hence remove the entries with a distance value of 0 and re-evaluate the descriptive statistics to calculate the correct mode values.
# Average vs. most frequent trip distance per fare type, after removing
# the trips whose distance is exactly 0.
drop_index = bikeshare.query(' distance_miles == 0 ').index
temp_df = bikeshare.drop(drop_index)

plt.figure(figsize=[12, 5])
flatui = ["#fff480"]
sb.set_palette(flatui, n_colors=1, desat=0.8)
base_color = sb.color_palette()[0]

# left plot: bar chart - average trip distance (barplot aggregates with the mean by default)
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.barplot(data=temp_df, x="fare_type", y="distance_miles", hue='fare_type', alpha=0.8, dodge=False)
plt.title('Avg. Trip distance\n\n', weight='bold', fontsize=16, color='dimgrey')
plt.ylabel('Avg. distance (miles)\n', fontsize=14)
plt.xlabel('\nFare type', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);
# an empty, frameless legend suppresses the redundant hue legend on this panel
plt.legend('', frameon=False, fancybox=False)
# add annotations: print each bar's height 10 points above its top
# -------------------------------------------------------
for p in ax1.patches:
    ax1.annotate(format(p.get_height(), '.1f'), (p.get_x() + p.get_width() / 2., p.get_height()),
                 ha='center', va='center', xytext=(0, 10), textcoords='offset points', fontsize=12)
# -------------------------------------------------------

# right plot: bar chart - most frequent (modal) trip distance
sb.set_style('white')
plt.subplot(1, 2, 2)
labels = temp_df.fare_type.sort_values(ascending=True).unique()
# compute every mode from its own label so bars and tick labels cannot get out of step,
# whatever order `labels` comes back in
heights = [temp_df.loc[temp_df.fare_type == label, 'distance_miles'].mode()[0]
           for label in labels]
ax2 = sb.barplot(x=labels, y=heights, hue=labels, alpha=0.8, dodge=False)
plt.title('Most frequent trip distance\n\n', weight='bold', fontsize=16, color='dimgrey')
plt.xlabel('\nFare type', fontsize=14)
plt.ylabel('Distance (miles)\n', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True);
# the legend lists fare types, so title it accordingly
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha=1,
           borderpad=1, borderaxespad=1, loc='upper left', labelspacing=0.5, ncol=1,
           title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5)
# add annotations
# -------------------------------------------------------
for p in ax2.patches:
    ax2.annotate(format(p.get_height(), '.1f'), (p.get_x() + p.get_width() / 2., p.get_height()),
                 ha='center', va='center', xytext=(0, 10), textcoords='offset points', fontsize=12)
# -------------------------------------------------------
# adjust the two plots to have the same y axis limits (the taller one wins).
# get_ylim() returns (bottom, top); compare the tops explicitly, because comparing
# the tuples would be decided by the bottoms whenever those differ
if ax1.get_ylim()[1] < ax2.get_ylim()[1]:
    ax1.set_ylim(ax2.get_ylim())
else:
    ax2.set_ylim(ax1.get_ylim())
plt.subplots_adjust(wspace=0.3, hspace=0.3)
# leave head-room for the figure-level title
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of all trip distances based on fare type\n', fontsize=16, weight='bold');
# bbox_inches='tight' grows the saved area so that all x and y labels are included
plt.savefig('plots/3.2.26.d Assessment of all trip distances based on fare type.png', dpi=300, bbox_inches='tight')
Dataset limited to 3 miles:
# percentage (two decimals) of all trips whose distance is at most 3 miles
trips_within_3_miles = len(bikeshare.query(' distance_miles <= 3 '))
np.round((trips_within_3_miles / len(bikeshare)) * 100, 2)
The calculations are influenced by the presence of outliers. Calculate the descriptive statistics by limiting the dataset to entries under 3 miles, which constitute 99% of the distance distribution. Also remove the entries with a distance of zero, as they are clusters of Round Trip data that distort the actual statistics.
# percentage (two decimals) of all trips longer than 0 miles and at most 3 miles
nonzero_trips_within_3_miles = len(bikeshare.query(' distance_miles <= 3 and distance_miles > 0 '))
np.round((nonzero_trips_within_3_miles / len(bikeshare)) * 100, 2)
# keep only the trips longer than 0 miles and no longer than 3 miles
is_short_nonzero_trip = (bikeshare.distance_miles <= 3) & (bikeshare.distance_miles > 0)
distance_lim_3 = bikeshare[is_short_nonzero_trip]
# Histogram of trip distances between 0 and 3 miles, one facet per fare type.
from matplotlib.ticker import FuncFormatter

sb.set_style('white')
flatui = ["#fff480"]
sb.set_palette(flatui, n_colors=1, desat=0.8)
base_color = sb.color_palette()[0]
# 0.1-mile wide bins; the x axis is clipped to the 0-3 mile window
g = sb.FacetGrid(data=distance_lim_3, col='fare_type', col_wrap=2, height=4, aspect=1, xlim=(0, 3))
g.map(plt.hist, "distance_miles", color=base_color,
      bins=np.arange(0, bikeshare.distance_miles.max() + 0.1, 0.1))
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Distribution of trip distances under 3 miles based on fare type\n', fontsize=16, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')
# show the y ticks (rental counts) in thousands.
# A formatter is attached to each facet instead of parsing the existing label text:
# that text may still be empty before the figure is drawn, and pooling the labels
# of every facet into one list gives more labels than any single axis has ticks.
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(
        FuncFormatter(lambda count, _pos: '{:0.0f} K'.format(count / 1000)))
    ax.tick_params(axis='both', labelsize=12)
g.set_xlabels('\nDistance (miles)', size=14)
g.set_ylabels('Rentals (thousands)\n', size=14)
plt.subplots_adjust(wspace=0.2, hspace=0.3);
# bbox_inches='tight' grows the saved area so that all x and y labels are included
# NOTE(review): the file name says "bike type" although the grid is faceted by fare_type;
# left unchanged in case the image is referenced elsewhere - confirm and rename.
plt.savefig('plots/3.2.26.e Facet Grid of trip distances under 3 miles on bike type.png', dpi=300, bbox_inches='tight')
Let us plot a wider variety of graphs to observe the distribution of the data and uncover hidden insights.
# Violin, box and strip plot of trip distance per fare type, side by side,
# drawn from the distance_lim_3 subset.
plt.figure(figsize=[16, 4])
sb.set_style('darkgrid')
flatui = ["#fff480"]
sb.set_palette(flatui, n_colors=1, desat=0.8)
base_color = sb.color_palette()[0]

# (plot function, plot-specific keywords, panel title, y-axis label) from left to right;
# only the left-most panel carries a y-axis label
panels = [
    (sb.violinplot, {'inner': 'quartile'}, 'Trip distances - Violin plot\n', 'Distance (miles)\n'),
    (sb.boxplot, {}, 'Trip distances - Box plot\n', ''),
    (sb.stripplot, {'alpha': 0.5}, 'Trip distances - Strip plot\n', ''),
]
for position, (draw, extra, heading, y_caption) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    draw(data=distance_lim_3, x='fare_type', y='distance_miles', color=base_color, **extra)
    plt.title(heading, weight='bold', fontsize=16)
    plt.xlabel('\nFare type', fontsize=14)
    plt.ylabel(y_caption, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

plt.subplots_adjust(wspace=0.3, hspace=0.3);
# bbox_inches='tight' grows the saved area so that all x and y labels are included
plt.savefig('plots/3.2.26.f Distribution of Fare type durations under 3 miles on various plots.png', dpi=300, bbox_inches='tight')
Calculate the average trip distance and the most frequent trip distance for each fare type.
# Compare, per fare type, the average trip distance (left panel) with the
# most frequent (modal) trip distance (right panel) in `distance_lim_3`.
plt.figure(figsize = [12, 5])
flatui = ["#fff480"]
sb.set_palette(flatui, n_colors=1, desat=0.8)
base_color = sb.color_palette()[0]

# left plot: bar chart - average trip distance
sb.set_style('white')
plt.subplot(1, 2, 1)
ax1 = sb.barplot(data = distance_lim_3, x = "fare_type", y = "distance_miles", hue = 'fare_type',
                 alpha = 0.8, dodge=False)
plt.title('Avg. Trip distance\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.ylabel('Avg. distance (miles)\n', fontsize = 14)
plt.xlabel('\nFare type', fontsize = 14)
plt.xticks(fontsize = 12)
plt.yticks(fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True)
# an empty label list hides the legend on the left panel; the right panel carries it
plt.legend('', frameon=False, fancybox=False)
# annotate every bar with its height, 10 points above the bar top
for p in ax1.patches:
    ax1.annotate(format(p.get_height(), '.1f'), (p.get_x() + p.get_width() / 2., p.get_height()),
                 ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points', fontsize = 12)

# right plot: bar chart - most frequent trip distance
sb.set_style('white')
plt.subplot(1, 2, 2)
base_mode = distance_lim_3.query(' fare_type == "Base" ').distance_miles.mode()[0]
extended_mode = distance_lim_3.query(' fare_type == "Extended" ').distance_miles.mode()[0]
heights = [base_mode, extended_mode]
# NOTE(review): `heights` is ordered Base, Extended; this assumes the sorted
# unique fare types come out in that same order - confirm against the data.
labels = distance_lim_3.fare_type.sort_values(ascending=True).unique()
ax2 = sb.barplot(x = labels, y = heights, hue = labels, alpha = 0.8, dodge=False)
plt.title('Most frequent trip distance\n\n', weight = 'bold', fontsize = 16, color = 'dimgrey')
plt.xlabel('\nFare type', fontsize = 14)
plt.ylabel('Distance (miles)\n', fontsize = 14)
plt.xticks(fontsize = 12)
plt.yticks(fontsize=12)
sb.despine(top=True, bottom=False, left=False, right=True)
# the hue variable is the fare type, so the legend is titled accordingly
# (it was previously mislabelled 'Pass type')
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha = 1,
           borderpad=1, borderaxespad=1, loc = 'upper left', labelspacing=0.5, ncol = 1,
           title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5)
# annotate every bar with its height, 10 points above the bar top
for p in ax2.patches:
    ax2.annotate(format(p.get_height(), '.1f'), (p.get_x() + p.get_width() / 2., p.get_height()),
                 ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points', fontsize = 12)

# give both panels the same y-axis limits: the panel with the smaller upper
# limit adopts the limits of the other one (compare the upper bounds
# explicitly rather than the whole (lower, upper) tuples)
if ax1.get_ylim()[1] < ax2.get_ylim()[1]:
    ax1.set_ylim(ax2.get_ylim())
else:
    ax2.set_ylim(ax1.get_ylim())
plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.subplots_adjust(top=0.7)
plt.suptitle('Assessment of trip distances under 3 miles based on fare type\n', fontsize = 16, weight = 'bold')
# bbox_inches='tight' grows the saved area so every axis label is included
plt.savefig('plots/3.2.26.g Assessment of trip distances under 3 miles based on fare type.png', dpi=300, bbox_inches='tight')
bikeshare.hour and trip_type columns:
Columns: hour, trip_type
Data type: (Numerical, continuous) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
Stack plot:
# Stacked bar chart: rentals per hour of the day, one stacked layer per trip type.
sb.set_style('white')
flatui = ["lightskyblue", "cornflowerblue"]
sb.set_palette(flatui, desat = 0.8)
plt.figure(figsize = [8, 6])
# Sort both orders so the x axis runs through the hours in ascending order
# and the layers follow the category order, instead of depending on the
# order in which values happen to appear in the data.
cat1_order = bikeshare.hour.sort_values(ascending=True).unique()
cat2_order = bikeshare.trip_type.sort_values(ascending=True).unique()
# running top edge of the stack for every hour
baselines = np.zeros(len(cat1_order))
for cat2 in cat2_order:
    # rentals per hour for this trip type only
    inner_counts = bikeshare[bikeshare['trip_type'] == cat2]['hour'].value_counts()
    # reindex (rather than label-indexing) so an hour without any rental of
    # this trip type becomes 0 instead of raising a KeyError
    layer_heights = inner_counts.reindex(cat1_order).fillna(0).to_numpy()
    # draw this layer on top of the layers accumulated so far
    plt.bar(x = np.arange(len(cat1_order)), height = layer_heights, bottom = baselines)
    baselines += layer_heights
# y ticks every 10,000 rentals, labelled in thousands, up to the busiest hour
max_count = bikeshare.groupby('hour').size().max()
y_tick_values = np.arange(0, max_count + 10000, 10000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
plt.title('Aggregated hourly rentals based on trip type\n', weight = 'bold', fontsize = 16)
plt.xticks(np.arange(len(cat1_order)), cat1_order)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\n Hour of the day', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.legend(list(cat2_order), scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
           framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper left', labelspacing=0.5,
           title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5)
sb.despine()
# bbox_inches='tight' grows the saved area so every axis label is included
plt.savefig('plots/3.2.27.a Stacked plot of trip rentals based on hour of the day.png', dpi=300, bbox_inches='tight')
Line Plot:
The data needs some summarization: group it by the respective variables, then reset the index and name the summarized values accordingly.
# Long-format table with one row per (trip type, hour) pair and the number
# of rentals in that pair stored in a 'rentals' column.
categorical_counts = (
    bikeshare
    .groupby(['trip_type', 'hour'])
    .size()
    .reset_index(name='rentals')
)
categorical_counts.head(10)
# Line plot: aggregated rentals per hour of the day, one line per trip type.
# Expects `categorical_counts` in long format (trip_type, hour, rentals).
sb.set_palette(palette = "colorblind", n_colors = 2, desat = 0.8)
current_palette = sb.color_palette()
sb.set_style('white')
plt.figure(figsize = [6, 4])
# y ticks every 10,000 rentals, labelled in thousands
max_count = categorical_counts.rentals.max()
y_tick_values = np.arange(0, max_count + 10000, 10000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
# plot line plot
ax = sb.lineplot(data=categorical_counts, x = "hour", y = "rentals", hue="trip_type", style="trip_type", err_style="bars")
plt.title('Aggregated hourly rentals based on trip type\n', weight = 'bold', fontsize = 16)
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\n Hour of the day', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
# Build the legend exactly once. Calling ax.legend() again without arguments
# would replace the styled legend with a default one, and renaming
# legend.texts[0] overwrites a real category label whenever seaborn does not
# emit the hue name as the first entry. Instead, drop the pseudo-entry that
# some seaborn versions add for the hue column and set the title explicitly.
legend_handles, legend_labels = ax.get_legend_handles_labels()
legend_entries = [(handle, label) for handle, label in zip(legend_handles, legend_labels)
                  if label != 'trip_type']
legend = ax.legend([handle for handle, _ in legend_entries],
                   [label for _, label in legend_entries],
                   scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1, framealpha = 1,
                   borderpad=1, borderaxespad=1, loc = 'upper left', labelspacing=0.5,
                   title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
                   markerfirst=True, handlelength=2, handletextpad=0.5)
# bbox_inches='tight' grows the saved area so every axis label is included
plt.savefig('plots/3.2.27.b Line plot of trip rentals based on hour of the day.png', dpi=300, bbox_inches='tight')
Individual plots of aggregated distribution of hourly rentals over trip type:
# Facet grid: one histogram of rentals per hour of the day for each trip type.
from matplotlib.ticker import FuncFormatter

sb.set_style('white')
# FacetGrid creates its own figure, so no separate plt.figure() call is made
# (it would only leave an empty stray figure behind).
g = sb.FacetGrid(data = bikeshare, col = 'trip_type', col_wrap = 2, height = 4, aspect = 1.2)
# Bin edges must run one past the largest hour: with edges ending at max()
# the last bin is closed on both sides and would merge the final two hours.
g.map(plt.hist, "hour", color = 'lightskyblue', bins = np.arange(0, bikeshare.hour.max()+2, 1))
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Aggregated Hourly distribution of bike rentals categorized by trip type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
# Show the y ticks in thousands through a formatter instead of parsing the
# tick-label text, which may still be empty before the figure is drawn.
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(FuncFormatter(lambda value, _pos: '{:0.0f} K'.format(value/1000)))
    ax.tick_params(axis = 'both', labelsize = 12)
g.set_xlabels('\nHour of the day', size = 14)
g.set_ylabels('Rentals (thousands)\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3)
# bbox_inches='tight' grows the saved area so every axis label is included
plt.savefig('plots/3.2.27.c Facet Grid of trip rentals based on hour of the day.png', dpi=300, bbox_inches='tight')
Let us plot a wider variety of graphs to observe the distribution of the data and uncover hidden insights.
# Three side-by-side views (violin, box, strip) of the rental hour per trip type.
plt.figure(figsize=[14, 4])
sb.set_style('darkgrid')
base_color = 'lightskyblue'

# One entry per panel, left to right:
# (seaborn plotting function, extra keyword arguments, title, y-axis label).
# Only the left-most panel carries a y-axis label.
panel_specs = [
    (sb.violinplot, {'inner': 'quartile'}, 'Trip rentals - Violin plot\n', 'Hour\n'),
    (sb.boxplot, {}, 'Trip rentals - Box plot\n', ''),
    (sb.stripplot, {'alpha': 0.002}, 'Trip rentals - Strip plot\n', ''),
]
for panel_no, (draw_panel, panel_kwargs, panel_title, panel_ylabel) in enumerate(panel_specs, start=1):
    plt.subplot(1, 3, panel_no)
    draw_panel(data=bikeshare, x='trip_type', y='hour', color=base_color, **panel_kwargs)
    plt.title(panel_title, weight='bold', fontsize=16)
    plt.xlabel('\nTrip type', fontsize=14)
    plt.ylabel(panel_ylabel, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

plt.subplots_adjust(wspace=0.3, hspace=0.3)
# bbox_inches='tight' grows the saved area so every axis label is included
plt.savefig('plots/3.2.27.d Distribution of Trip rentals based on hour of the day.png',
            dpi=300, bbox_inches='tight')
Heat Map:
One alternative way of depicting the relationship between two categorical variables is through a Heat map. Heat maps were introduced earlier as the 2-d version of a histogram; here, we're using them as the 2-d version of a bar chart. The seaborn function heatmap is at home with this type of heat map implementation, but the input arguments are unlike most of the visualization functions. Instead of providing the original dataframe, we need to summarize the counts into a matrix that will then be plotted.
Now let's pivot the categorical counts created earlier into a more appropriate data structure.
# Reshape the long-format counts into a matrix: one row per trip type, one
# column per hour, with the rental count as the cell value.
# Note: this rebinds `categorical_counts`, so the cell is not re-runnable
# without first rebuilding the long-format table.
categorical_counts = pd.pivot(
    categorical_counts,
    index='trip_type',
    columns='hour',
    values='rentals',
)
categorical_counts
The data is ready to be plotted as a heat map.
# Heat map of rentals per (trip type, hour); each cell is annotated with its
# share of all rentals in percent.
plt.figure(figsize=[18, 2])
res = sb.heatmap(
    categorical_counts,
    annot=True,
    fmt='.0f',
    annot_kws={'size': 10},
    linewidths=0.1,
    cmap="YlGnBu",
)
plt.yticks(rotation=0, fontsize="10", va="center")
plt.title('Hourly distribution of rentals based on trip type\n', fontsize=14, weight='bold')
plt.xlabel('\nHour of the day', fontsize=12)
plt.ylabel('Trip type\n', fontsize=12)

# The annotations start out as raw counts; rewrite each as a percentage of
# the total number of rows in `bikeshare`, flooring the display at '< 0.1 %'.
total_rentals = bikeshare.shape[0]
for cell_text in res.texts:
    cell_count = int(cell_text.get_text())
    cell_share = np.round(np.round(cell_count / total_rentals, 4) * 100, 1)
    cell_text.set_text('< 0.1 %' if cell_share < 0.1 else '{} %'.format(cell_share))

# bbox_inches='tight' grows the saved area so every axis label is included
plt.savefig('plots/3.2.27.e Heat map of Trip rentals based on hour of the day.png',
            dpi=300, bbox_inches='tight')
Find average rentals based on the hour of the day:
# Rentals per (year, month, day, hour, trip type): count the trip ids in
# every group and expose the result as an integer 'rentals' column.
hourly_group_keys = ['year', 'month', 'day', 'hour', 'trip_type']
hours_df = (
    bikeshare
    .groupby(hourly_group_keys)['trip_id']
    .count()
    .reset_index(name='rentals')
)
# missing counts become 0 before the cast to int
hours_df['rentals'] = hours_df['rentals'].fillna(0).astype(int)
hours_df.head(10)
# Point plot of the average rentals per hour of the day, one line per trip
# type, with every point annotated with its average value.
plt.figure(figsize=[12,4])
sb.set_style('white')
flatui = ["#fff480"]
sb.set_palette(flatui, n_colors=1, desat=0.8)
base_color = sb.color_palette()[0]
sb.pointplot(data = hours_df, x = "hour", y = "rentals", linestyles = "-", hue = 'trip_type')
plt.title('Average hourly bike rentals categorized by trip type\n', weight = 'bold', fontsize = 16)
plt.ylabel('Avg. bike rentals\n', fontsize = 14)
plt.xlabel('\nHour of the day', fontsize = 14)
plt.xticks(fontsize = 12)
# take the automatic y tick locations and rebuild them from zero in steps of 10
locs, labels = plt.yticks()
max_count = locs.max()
y_tick_values = np.arange(0, max_count+10, 10)
y_tick_names = ['{:0.0f}'.format(v) for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize = 12)

# add annotations
# -------------------------------------------------------
# Average only the 'rentals' column: averaging every remaining column first
# does needless work and fails when one of them is not numeric.
avg_rentals = hours_df.groupby(['trip_type', 'hour'])['rentals'].mean().reset_index()
avg_rentals_max = avg_rentals.rentals.max()
avg_rentals_oneway = avg_rentals.query(' trip_type == "One Way" ')
avg_rentals_roundtrip = avg_rentals.query(' trip_type == "Round Trip" ')
# current x tick locations, one per hour
locs, labels = plt.xticks()
# (averages for one trip type, label box colour, vertical offset of the label)
annotation_specs = [
    (avg_rentals_oneway, 'gold', 10),
    (avg_rentals_roundtrip, 'limegreen', 5),
]
for categorical_df, clr, indent in annotation_specs:
    # NOTE(review): pairs tick locations with averages by position, which
    # assumes the averages are ordered by hour like the x ticks - confirm.
    for loc, count in zip(locs, categorical_df.rentals):
        plt.text(loc, count + indent, '{:0.0f}'.format(count), ha = 'center', color = 'black',
                 fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
           framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper left', labelspacing=0.5,
           title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5)
sb.despine(top=True, right=True, left=False, bottom=False)
plt.savefig('plots/3.2.27.f Average hourly bike rentals categorized by trip type.png', dpi=300, bbox_inches='tight')
# Compact point plot of the average rentals per hour of the day, one dashed
# line per trip type, with faint horizontal guide lines at every y tick.
plt.figure(figsize=[8, 5])
sb.set_style('white')
flatui = ["#fff480"]
sb.set_palette(flatui, n_colors=1, desat=0.8)
base_color = sb.color_palette()[0]
sb.pointplot(x="hour", y="rentals", hue='trip_type', data=hours_df, linestyles="--")
plt.title('Average hourly bike rentals categorized by trip type\n', weight='bold', fontsize=16)
plt.ylabel('Avg. bike rentals\n', fontsize=14)
plt.xlabel('\nHour of the day', fontsize=14)
plt.xticks(fontsize=12)

# y axis: fixed ticks from 0 to the hard-coded ceiling in steps of 10
locs, labels = plt.yticks()
hours_rental_avg_max = 70
y_tick_values = np.arange(0, hours_rental_avg_max + 10, 10)
y_tick_names = [format(tick_value, '0.0f') for tick_value in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

legend_style = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
    framealpha=1, borderpad=1, borderaxespad=1, loc='upper left', labelspacing=0.5,
    title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5,
)
plt.legend(**legend_style)
sb.despine(top=True, right=True, left=False, bottom=False)

# faint dashed guide line at every y tick
for guide_y in y_tick_values:
    plt.axhline(guide_y, ls='--', color='grey', linewidth=0.5, alpha=0.2)
plt.savefig('plots/3.2.27.g Average hourly bike rentals categorized by trip type.png',
            dpi=300, bbox_inches='tight')
bikeshare.hour and bike_type columns:
Columns: hour, bike_type
Data type: (Numerical, continuous) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
Stack plot:
# Stacked bar chart: rentals per hour of the day, one stacked layer per bike type.
sb.set_style('white')
flatui = ["darkslateblue", "lightseagreen", "royalblue", "rebeccapurple"]
sb.set_palette(flatui, desat = 0.8)
plt.figure(figsize = [8, 6])
cat1_order = bikeshare.hour.sort_values(ascending=True).unique()
cat2_order = bikeshare.bike_type.sort_values(ascending=True).unique()
# running top edge of the stack for every hour
baselines = np.zeros(len(cat1_order))
for cat2 in cat2_order:
    # rentals per hour for this bike type only
    inner_counts = bikeshare[bikeshare['bike_type'] == cat2]['hour'].value_counts()
    # reindex (rather than label-indexing) so an hour without any rental of
    # this bike type becomes 0 instead of raising a KeyError
    layer_heights = inner_counts.reindex(cat1_order).fillna(0).to_numpy()
    # draw this layer on top of the layers accumulated so far
    plt.bar(x = np.arange(len(cat1_order)), height = layer_heights,
            bottom = baselines, alpha = 0.8)
    baselines += layer_heights
# y ticks every 10,000 rentals, labelled in thousands, up to the busiest hour
max_count = bikeshare.groupby('hour').size().max()
y_tick_values = np.arange(0, max_count + 10000, 10000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
plt.title('Aggregated hourly rentals based on bike type\n', weight = 'bold', fontsize = 16)
plt.xticks(np.arange(len(cat1_order)), cat1_order)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\n Hour of the day', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.legend(list(cat2_order), scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
           framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper left', labelspacing=0.5,
           title='Bike type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5)
sb.despine()
# bbox_inches='tight' grows the saved area so every axis label is included
plt.savefig('plots/3.2.28.a Stack plot of rentals based on bike type and hour of the day.png', dpi=300, bbox_inches='tight')
Line plot:
The data needs some summarization: group it by the respective variables, then reset the index and name the summarized values accordingly.
# Long-format table with one row per (bike type, hour) pair and the number
# of rentals in that pair stored in a 'rentals' column.
categorical_counts = (
    bikeshare
    .groupby(['bike_type', 'hour'])
    .size()
    .reset_index(name='rentals')
)
categorical_counts.head(10)
# Line plot: aggregated rentals per hour of the day, one line per bike type.
# Expects `categorical_counts` in long format (bike_type, hour, rentals).
sb.set_style('white')
flatui = ['#ff91e2', '#91ffda', '#60acfc', '#bd91ff']
sb.set_palette(flatui, n_colors=4, desat=0.6)
plt.figure(figsize = [6, 4])
# y ticks every 10,000 rentals, labelled in thousands
max_count = categorical_counts.rentals.max()
y_tick_values = np.arange(0, max_count + 10000, 10000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
# plot line plot
ax = sb.lineplot(data=categorical_counts, x = "hour", y = "rentals", hue="bike_type", linewidth=3)
plt.title('Aggregated hourly rentals based on bike type\n', weight = 'bold', fontsize = 16)
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\n Hour of the day', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
# Build the legend exactly once. Calling ax.legend() again without arguments
# would replace the styled legend with a default one, and renaming
# legend.texts[0] overwrites a real category label whenever seaborn does not
# emit the hue name as the first entry. Instead, drop the pseudo-entry that
# some seaborn versions add for the hue column and set the title explicitly.
legend_handles, legend_labels = ax.get_legend_handles_labels()
legend_entries = [(handle, label) for handle, label in zip(legend_handles, legend_labels)
                  if label != 'bike_type']
legend = ax.legend([handle for handle, _ in legend_entries],
                   [label for _, label in legend_entries],
                   scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1, framealpha = 1,
                   borderpad=1, borderaxespad=1, loc = 'upper left', labelspacing=0.5,
                   title='Bike type', title_fontsize=12, fontsize=10, facecolor='white',
                   markerfirst=True, handlelength=2, handletextpad=0.5)
# bbox_inches='tight' grows the saved area so every axis label is included
plt.savefig('plots/3.2.28.b Line plot of Trip rentals based on hour of the day over bike type.png', dpi=300, bbox_inches='tight')
Individual plots of aggregated distribution of hourly rentals over bike type:
# Facet grid: one histogram of rentals per hour of the day for each bike type.
from matplotlib.ticker import FuncFormatter

sb.set_style('white')
# FacetGrid creates its own figure, so no separate plt.figure() call is made
# (it would only leave an empty stray figure behind).
g = sb.FacetGrid(data = bikeshare, col = 'bike_type', col_wrap = 2, height = 4, aspect = 1.2)
# bin edges run one past the largest hour so every hour gets its own bin
g.map(plt.hist, "hour", color = 'lightskyblue', bins = np.arange(0, bikeshare.hour.max()+2, 1))
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Aggregated Hourly distribution of bike rentals categorized by bike type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
# Show the y ticks in thousands through a formatter instead of parsing the
# tick-label text, which may still be empty before the figure is drawn.
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(FuncFormatter(lambda value, _pos: '{:0.0f} K'.format(value/1000)))
    ax.tick_params(axis = 'both', labelsize = 12)
g.set_xlabels('\nHour of the day', size = 14)
g.set_ylabels('Rentals (thousands)\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3)
# bbox_inches='tight' grows the saved area so every axis label is included
plt.savefig('plots/3.2.28.c Facet Grid of trip rentals based on hour of the day over bike type.png', dpi=300, bbox_inches='tight')
Let us plot a wider variety of graphs to observe the distribution of the data and uncover hidden insights.
# Three side-by-side views (violin, box, strip) of the rental hour per bike type.
plt.figure(figsize=[14, 4])
sb.set_style('darkgrid')
base_color = 'lightskyblue'

# One entry per panel, left to right:
# (seaborn plotting function, extra keyword arguments, title, y-axis label).
# Only the left-most panel carries a y-axis label.
panel_specs = [
    (sb.violinplot, {'inner': 'quartile'}, 'Trip rentals - Violin plot\n', 'Hour\n'),
    (sb.boxplot, {}, 'Trip rentals - Box plot\n', ''),
    (sb.stripplot, {'alpha': 0.002}, 'Trip rentals - Strip plot\n', ''),
]
for panel_no, (draw_panel, panel_kwargs, panel_title, panel_ylabel) in enumerate(panel_specs, start=1):
    plt.subplot(1, 3, panel_no)
    draw_panel(data=bikeshare, x='bike_type', y='hour', color=base_color, **panel_kwargs)
    plt.title(panel_title, weight='bold', fontsize=16)
    plt.xlabel('\nBike type', fontsize=14)
    plt.ylabel(panel_ylabel, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

plt.subplots_adjust(wspace=0.3, hspace=0.3)
# bbox_inches='tight' grows the saved area so every axis label is included
plt.savefig('plots/3.2.28.d Distribution of Trip rentals based on hour of the day over bike type.png',
            dpi=300, bbox_inches='tight')
Heat Map:
Now let's pivot the categorical counts created earlier into a more appropriate data structure.
# Reshape the long-format counts into a matrix: one row per bike type, one
# column per hour, with the rental count as the cell value.
# Note: this rebinds `categorical_counts`, so the cell is not re-runnable
# without first rebuilding the long-format table.
categorical_counts = pd.pivot(
    categorical_counts,
    index='bike_type',
    columns='hour',
    values='rentals',
)
categorical_counts
The data is ready to be plotted as a heat map.
# Heat map of rentals per (bike type, hour); each cell is annotated with its
# share of all rentals in percent.
plt.figure(figsize = [18, 4])
res = sb.heatmap(categorical_counts, annot = True, fmt = '.0f', annot_kws={'size':10}, linewidths=0.1, cmap="YlGnBu")
plt.yticks(rotation=0, fontsize="12", va="center")
plt.xticks(fontsize="12")
plt.title('Hourly distribution of bike rentals based on bike type\n', fontsize = 16, weight = 'bold')
plt.xlabel('\nHour of the day', fontsize=14)
plt.ylabel('Bike type\n', fontsize=14)
# The annotations start out as raw counts; rewrite each as a percentage of
# the total number of rows in `bikeshare`.
for t in res.texts:
    p = np.round(np.round(int(t.get_text())/bikeshare.shape[0], 4)*100, 1)
    # same display floor as the trip-type heat map: a small but non-zero
    # share would otherwise be shown as a misleading '0.0 %'
    if p < 0.1:
        t.set_text('< 0.1 %')
    else:
        t.set_text('{} %'.format(p))
# bbox_inches='tight' grows the saved area so every axis label is included
plt.savefig('plots/3.2.28.e Heat map of Trip rentals based on hour of the day over bike type.png', dpi=300, bbox_inches='tight')
Find average rentals based on the hour of the day:
# Rentals per (year, month, day, hour, bike type), counted with size().
hourly_group_keys = ['year', 'month', 'day', 'hour', 'bike_type']
hours_df = (
    bikeshare
    .groupby(hourly_group_keys)
    .size()
    .rename_axis(hourly_group_keys)
    .reset_index(name='rentals')
)
hours_df.head()
# inspect the last ten rows
hours_df.tail(10)
The `bike_type` column needs to be converted into a categorical datatype; otherwise the `groupby` method will ignore `NaN` rental values and, in turn, produce an incorrect average rental value. Also, the `size()` method ignores the unused level combinations of the groups, hence use the `count()` method instead. This can be observed in the above dataframe: the entries for `2017` do not have a `bike_type` of `Standard`, `Electric` or `Smart` for each hour.
# Rentals per (year, month, day, hour, bike type): count the trip ids in
# every group and expose the result as an integer 'rentals' column.
hourly_group_keys = ['year', 'month', 'day', 'hour', 'bike_type']
hours_df = (
    bikeshare
    .groupby(hourly_group_keys)['trip_id']
    .count()
    .reset_index(name='rentals')
)
# missing counts become 0 before the cast to int
hours_df['rentals'] = hours_df['rentals'].fillna(0).astype(int)
hours_df.head(10)
# Point plot of the average rentals per hour of the day, one line per bike
# type, with faint horizontal guide lines at every y tick.
plt.figure(figsize=[8, 5])
sb.set_style('white')
flatui = ['#ff7ddd', '#77f7cc', '#4b99eb', '#aa75fa']
sb.set_palette(flatui, n_colors=4, desat=0.6)
base_color = sb.color_palette()[0]
sb.pointplot(x="hour", y="rentals", hue='bike_type', data=hours_df,
             linestyles="-", scale=1, ci=None)
plt.title('Average hourly bike rentals categorized by bike type\n', weight='bold', fontsize=16)
plt.ylabel('Avg. bike rentals\n', fontsize=14)
plt.xlabel('\nHour of the day', fontsize=14)
plt.xticks(fontsize=12)

# y axis: fixed ticks from 0 to the hard-coded ceiling in steps of 10
locs, labels = plt.yticks()
hours_rental_avg_max = 70
y_tick_values = np.arange(0, hours_rental_avg_max + 10, 10)
y_tick_names = [format(tick_value, '0.0f') for tick_value in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

legend_style = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
    framealpha=1, borderpad=1, borderaxespad=1, loc='upper left', labelspacing=0.5,
    title='Bike type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5,
)
plt.legend(**legend_style)
sb.despine(top=True, right=True, left=False, bottom=False)

# faint dashed guide line at every y tick
for guide_y in y_tick_values:
    plt.axhline(guide_y, ls='--', color='grey', linewidth=0.5, alpha=0.2)
plt.savefig('plots/3.2.28.f Average hourly bike rentals categorized by bike type.png',
            dpi=300, bbox_inches='tight')
bikeshare.hour and pass_type columns:
Columns: hour, pass_type
Data type: (Numerical, continuous) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
Stack plot:
# Stacked bar chart: rentals per hour of the day, one stacked layer per pass type.
sb.set_style('white')
plt.figure(figsize = [8, 6])
flatui = ["#26bda7", "#9b59b6", "#3498db", "#e74c3c", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.6)
cat1_order = bikeshare.hour.sort_values(ascending=True).unique()
cat2_order = bikeshare.pass_type.sort_values(ascending=True).unique()
# running top edge of the stack for every hour
baselines = np.zeros(len(cat1_order))
for cat2 in cat2_order:
    # rentals per hour for this pass type only
    inner_counts = bikeshare[bikeshare['pass_type'] == cat2]['hour'].value_counts()
    # reindex (rather than label-indexing) so an hour without any rental of
    # this pass type becomes 0 instead of raising a KeyError
    layer_heights = inner_counts.reindex(cat1_order).fillna(0).to_numpy()
    # draw this layer on top of the layers accumulated so far
    plt.bar(x = np.arange(len(cat1_order)), height = layer_heights,
            bottom = baselines, alpha = 0.8)
    baselines += layer_heights
# y ticks every 10,000 rentals, labelled in thousands, up to the busiest hour
max_count = bikeshare.groupby('hour').size().max()
y_tick_values = np.arange(0, max_count + 10000, 10000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
plt.title('Aggregated hourly rentals based on pass type\n', weight = 'bold', fontsize = 16)
plt.xticks(np.arange(len(cat1_order)), cat1_order)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nHour of the day', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.legend(list(cat2_order), scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
           framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper left', labelspacing=0.5,
           title='Pass type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5)
sb.despine()
# bbox_inches='tight' grows the saved area so every axis label is included
plt.savefig('plots/3.2.29.a Stack plot of rentals based on pass type and hour of the day.png', dpi=300, bbox_inches='tight')
Line plot:
The data needs some summarization: group it by the respective variables, then reset the index and name the summarized values accordingly.
# Long-format table with one row per (pass type, hour) pair and the number
# of rentals in that pair stored in a 'rentals' column.
categorical_counts = (
    bikeshare
    .groupby(['pass_type', 'hour'])
    .size()
    .reset_index(name='rentals')
)
categorical_counts.head(10)
# Line plot: aggregated rentals per hour of the day, one line per pass type.
# Expects `categorical_counts` in long format (pass_type, hour, rentals).
sb.set_style('white')
flatui = ["#26bda7", "#9b59b6", "#3498db", "#e74c3c", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.6)
plt.figure(figsize = [6, 4])
# y ticks every 10,000 rentals, labelled in thousands
max_count = categorical_counts.rentals.max()
y_tick_values = np.arange(0, max_count + 10000, 10000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
# plot line plot
ax = sb.lineplot(data=categorical_counts, x = "hour", y = "rentals", hue="pass_type", linewidth=3, alpha = 0.8)
plt.title('Aggregated hourly rentals based on pass type\n', weight = 'bold', fontsize = 16)
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\n Hour of the day', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
# Build the legend exactly once. Calling ax.legend() again without arguments
# would replace the styled legend with a default one, and renaming
# legend.texts[0] overwrites a real category label whenever seaborn does not
# emit the hue name as the first entry. Instead, drop the pseudo-entry that
# some seaborn versions add for the hue column and set the title explicitly.
legend_handles, legend_labels = ax.get_legend_handles_labels()
legend_entries = [(handle, label) for handle, label in zip(legend_handles, legend_labels)
                  if label != 'pass_type']
legend = ax.legend([handle for handle, _ in legend_entries],
                   [label for _, label in legend_entries],
                   scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1, framealpha = 1,
                   borderpad=1, borderaxespad=1, loc = 'upper left', labelspacing=0.5,
                   title='Pass type', title_fontsize=12, fontsize=10, facecolor='white',
                   markerfirst=True, handlelength=2, handletextpad=0.5)
# bbox_inches='tight' grows the saved area so every axis label is included
plt.savefig('plots/3.2.29.b Line plot of Trip rentals based on hour of the day over pass type.png', dpi=300, bbox_inches='tight')
Individual plots of aggregated distribution of hourly rentals over pass type:
# Facet grid: one histogram of rentals per hour of the day for each pass type.
from matplotlib.ticker import FuncFormatter

sb.set_style('white')
# FacetGrid creates its own figure, so no separate plt.figure() call is made
# (it would only leave an empty stray figure behind).
g = sb.FacetGrid(data = bikeshare, col = 'pass_type', col_wrap = 3, height = 3, aspect = 1)
# bin edges run one past the largest hour so every hour gets its own bin
g.map(plt.hist, "hour", color = 'lightskyblue', bins = np.arange(0, bikeshare.hour.max()+2, 1))
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Aggregated Hourly distribution of bike rentals categorized by pass type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
# Show the y ticks in thousands through a formatter instead of parsing the
# tick-label text, which may still be empty before the figure is drawn.
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(FuncFormatter(lambda value, _pos: '{:0.0f} K'.format(value/1000)))
    ax.tick_params(axis = 'both', labelsize = 12)
g.set_xlabels('\nHour of the day', size = 14)
g.set_ylabels('Rentals (thousands)\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3)
# bbox_inches='tight' grows the saved area so every axis label is included
plt.savefig('plots/3.2.29.c Facet Grid of trip rentals based on hour of the day over pass type.png', dpi=300, bbox_inches='tight')
Let us plot a wider variety of graphs to observe the distribution of the data and uncover hidden insights.
# Three side-by-side views (violin, box, strip) of the rental hour for each pass type.
plt.figure(figsize=[16, 4])
sb.set_style('darkgrid')
base_color = 'lightskyblue'

# per panel: (seaborn plotting function, extra keyword arguments, title, y-axis label)
panels = [
    (sb.violinplot, {'inner': 'quartile'}, 'Trip rentals - Violin plot\n', 'Hour\n'),
    (sb.boxplot, {}, 'Trip rentals - Box plot\n', ''),
    # the strip plot uses nearly transparent markers (alpha = 0.002)
    (sb.stripplot, {'alpha': 0.002}, 'Trip rentals - Strip plot\n', ''),
]
for position, (draw, extra, panel_title, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    draw(data=bikeshare, x='pass_type', y='hour', color=base_color, **extra)
    plt.title(panel_title, weight='bold', fontsize=16)
    plt.xlabel('\nPass type', fontsize=14)
    plt.ylabel(y_label, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

plt.subplots_adjust(wspace=0.3, hspace=0.3);
# bbox_inches='tight' makes savefig include all of the x and y labels
plt.savefig('plots/3.2.29.d Distribution of Trip rentals based on hour of the day over pass type.png', dpi=300, bbox_inches='tight')
Heat Map:
Now let's pivot the categorical dataset created earlier into a more appropriate data structure.
# Reshape the long-format counts into a matrix: one row per pass type,
# one column per hour, with the 'rentals' value in each cell.
categorical_counts = categorical_counts.pivot(index = 'pass_type', columns = 'hour', values = 'rentals')
# display the pivoted table
categorical_counts
The data is ready to be plotted as a
heat map.
# Heat map of rentals per pass type and hour; each cell is annotated with
# its share of all trips.
plt.figure(figsize=[18, 4])
res = sb.heatmap(categorical_counts, annot=True, fmt='.0f', annot_kws={'size': 10},
                 linewidths=0.1, cmap="YlGnBu")
plt.title('Hourly distribution of bike rentals based on pass type\n', fontsize=16, weight='bold')
plt.xlabel('\nHour of the day', fontsize=14)
plt.ylabel('Pass type\n', fontsize=14);
plt.xticks(fontsize="12")
plt.yticks(rotation=0, fontsize="12", va="center")

# replace every raw-count annotation with its percentage of the total number of trips
for t in res.texts:
    share = int(t.get_text()) / bikeshare.shape[0]
    p = np.round(np.round(share, 4) * 100, 1)
    # shares that round below 0.1 % are shown as '< 0.1%'
    t.set_text('< 0.1%' if p < 0.1 else str(p) + ' %');

# bbox_inches='tight' makes savefig include all of the x and y labels
plt.savefig('plots/3.2.29.d Heat map of Trip rentals based on hour of the day over pass type.png', dpi=300, bbox_inches='tight')
Find average rentals based on the hour of the day:
The `pass_type` column needs to be converted into a categorical datatype; otherwise the `groupby` method will ignore `NaN` rental values and in turn produce an incorrect average rental value. Also, the `size()` method ignores the unused level combinations of the groups, hence the `count()` method is used.
# Count the rentals for every (year, month, day, hour, pass type) combination.
# observed=False is passed explicitly: the later averages rely on unused
# categorical levels showing up with a count of 0, and relying on the implicit
# default is deprecated in recent pandas releases.
# NOTE(review): combinations of the grouping keys that never occur may also
# appear as 0-rental rows and pull the averages down - confirm this is intended.
hours_df = bikeshare.groupby([bikeshare["year"],
                              bikeshare["month"],
                              bikeshare["day"],
                              bikeshare["hour"],
                              bikeshare["pass_type"]],
                             observed=False).count()['trip_id'].reset_index(name='rentals')
# guard against missing counts and keep the column as integers
hours_df['rentals'] = hours_df['rentals'].fillna(0).astype(int)
hours_df.head(10)
# Point plot of the average number of rentals per hour, one line per pass type.
plt.figure(figsize=[8, 5])
sb.set_style('white')
# alternative palette: ["#26bda7", "#9b59b6", "#3498db", "#e74c3c", "#34495e"]
flatui = ["#34e0c7", "#c271e3", "#4cb1f5", "#e06458", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.6)
base_color = sb.color_palette()[0]
sb.pointplot(data=hours_df, x="hour", y="rentals", hue='pass_type',
             linestyles="-", scale=1, ci=None)
plt.title('Average hourly bike rentals categorized by pass type\n', weight='bold', fontsize=16)
plt.xlabel('\nHour of the day', fontsize=14)
plt.ylabel('Avg. bike rentals\n', fontsize=14)
plt.xticks(fontsize=12)

# fixed y ticks from 0 up to the hard-coded maximum, in steps of 10
locs, labels = plt.yticks()
hours_rental_avg_max = 70
y_tick_values = np.arange(0, hours_rental_avg_max+10, 10)
y_tick_names = list(map('{:0.0f}'.format, y_tick_values))
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

legend_style = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
                    framealpha=1, borderpad=1, borderaxespad=1, loc='upper left', labelspacing=0.5,
                    title='Pass type', title_fontsize=12, fontsize=10, facecolor='white',
                    markerfirst=True, handlelength=2, handletextpad=0.5)
plt.legend(**legend_style)
sb.despine(top=True, right=True, left=False, bottom=False);

# faint dashed guide line at every y tick
for loc in y_tick_values:
    plt.axhline(loc, ls='--', color='grey', linewidth=0.5, alpha=0.2)
plt.savefig('plots/3.2.29.e Average hourly bike rentals categorized by pass type.png', dpi=300, bbox_inches='tight')
bikeshare.hour and fare_type columns:
Columns: hour, fare_type
Data type: (Numerical, continuous) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
Stack plot:
# Stacked bar chart: rentals per hour of the day, stacked by fare type.
sb.set_style('white')
plt.figure(figsize=[8, 6])
flatui = ['#466887', '#eda668']
sb.set_palette(flatui, n_colors=2, desat=0.6)

cat1_order = bikeshare.hour.sort_values(ascending=True).unique()
cat2_order = bikeshare.fare_type.sort_values(ascending=True).unique()
bar_positions = np.arange(len(cat1_order))

# draw one layer of bars per fare type on top of the running baseline
baselines = np.zeros(len(cat1_order))
for cat2 in cat2_order:
    # rentals per hour for this fare type only
    inner_counts = bikeshare[bikeshare['fare_type'] == cat2]['hour'].value_counts()
    layer_heights = inner_counts[cat1_order].fillna(0)
    plt.bar(x=bar_positions, height=layer_heights, bottom=baselines, alpha=0.8)
    baselines += layer_heights

# y ticks every 10,000 rentals, labelled in thousands
hourly_totals = bikeshare.groupby([bikeshare['hour']]).size().reset_index(name='rentals')
max_count = hourly_totals.max()['rentals']
y_tick_values = np.arange(0, max_count + 10000, 10000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]

plt.title('Aggregated hourly rentals based on fare type\n', weight='bold', fontsize=16)
plt.xlabel('\nHour of the day', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.xticks(bar_positions, cat1_order)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

legend_style = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
                    framealpha=1, borderpad=1, borderaxespad=1, loc='upper left', labelspacing=0.5,
                    title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
                    markerfirst=True, handlelength=2, handletextpad=0.5)
plt.legend(cat2_order, **legend_style)
sb.despine();
# bbox_inches='tight' makes savefig include all of the x and y labels
plt.savefig('plots/3.2.30.a Stack plot of rentals based on fare type and hour of the day.png', dpi=300, bbox_inches='tight')
Line plot:
The data needs some summarization by grouping together the respective variables. Then, reset the index and name the summarized data values accordingly.
# Count the rentals for each (fare_type, hour) pair and flatten the result
# into a dataframe whose count column is named 'rentals'.
categorical_counts = bikeshare.groupby([bikeshare['fare_type'],
bikeshare['hour']]).size().reset_index(name='rentals')
# preview the first ten rows
categorical_counts.head(10)
# Line plot: total rentals per hour of the day, one line per fare type.
# set the palette as per requirement
sb.set_style('white')
flatui = ['#466887', '#eda668']
sb.set_palette(flatui, n_colors=2, desat=0.6)
plt.figure(figsize = [6, 4])
# y ticks every 10,000 rentals, labelled in thousands
max_count = categorical_counts.rentals.max()
y_tick_values = np.arange(0, max_count + 10000, 10000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
# plot line plot
ax = sb.lineplot(data=categorical_counts, x = "hour", y = "rentals", hue="fare_type", linewidth=3, alpha = 0.8)
plt.title('Aggregated hourly rentals based on fare type\n', weight = 'bold', fontsize = 16)
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\n Hour of the day', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
# Build the legend in a single call. A second, bare ax.legend() would rebuild it with
# default styling, and overwriting legend.texts[0] renames the first category whenever
# seaborn does not emit the hue name as a leading pseudo-entry.
handles, labels = ax.get_legend_handles_labels()
if labels and labels[0] == 'fare_type':
    # some seaborn releases prepend the hue variable name; the legend title replaces it
    handles, labels = handles[1:], labels[1:]
legend = ax.legend(handles, labels, scatterpoints=1, frameon=True, fancybox=True, shadow=False,
                   ncol = 1, framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper left',
                   labelspacing=0.5, title='Fare type', title_fontsize=12, fontsize=10,
                   facecolor='white', markerfirst=True, handlelength=2, handletextpad=0.5)
# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.30.b Line plot of Trip rentals based on hour of the day over fare type.png', dpi=300, bbox_inches='tight')
Individual plots of the aggregated distribution of hourly rentals by fare type:
# Facet grid: one histogram of the rental hour per fare type.
from matplotlib.ticker import FuncFormatter

sb.set_style('white')
# FacetGrid creates its own figure, so no separate plt.figure() call is made
# (it would only leave an empty, unused figure behind).
g = sb.FacetGrid(data = bikeshare, col = 'fare_type', col_wrap = 2, height = 4, aspect = 1)
# one bin per hour; the upper bound is max()+2 so the last hour gets its own right bin edge
g.map(plt.hist, "hour", color = 'lightskyblue', bins = np.arange(0, bikeshare.hour.max()+2, 1))
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Aggregated Hourly distribution of bike rentals categorized by fare type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
# Show the y ticks in thousands. A formatter is attached to every facet instead of
# reading the tick label texts back and re-setting them: collecting the labels of all
# facets into one list yields more labels than ticks, and parsing label text with
# int() depends on how the default formatter happens to render the values.
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(FuncFormatter(lambda value, pos: '{:0.0f} K'.format(value/1000)))
    ax.tick_params(axis = 'both', labelsize = 12)
g.set_xlabels('\nHour of the day', size = 14)
g.set_ylabels('Rentals (thousands)\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3);
# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.30.c Facet Grid of trip rentals based on hour of the day over fare type.png', dpi=300, bbox_inches='tight')
Let us plot a wider variety of graphs to observe the distribution of the data and uncover hidden insights.
# Three side-by-side views (violin, box, strip) of the rental hour for each fare type.
plt.figure(figsize=[14, 4])
sb.set_style('darkgrid')
base_color = 'lightskyblue'

# per panel: (seaborn plotting function, extra keyword arguments, title, y-axis label)
panels = [
    (sb.violinplot, {'inner': 'quartile'}, 'Trip rentals - Violin plot\n', 'Hour\n'),
    (sb.boxplot, {}, 'Trip rentals - Box plot\n', ''),
    # the strip plot uses nearly transparent markers (alpha = 0.002)
    (sb.stripplot, {'alpha': 0.002}, 'Trip rentals - Strip plot\n', ''),
]
for position, (draw, extra, panel_title, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    draw(data=bikeshare, x='fare_type', y='hour', color=base_color, **extra)
    plt.title(panel_title, weight='bold', fontsize=16)
    plt.xlabel('\nFare type', fontsize=14)
    plt.ylabel(y_label, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

plt.subplots_adjust(wspace=0.3, hspace=0.3);
# bbox_inches='tight' makes savefig include all of the x and y labels
plt.savefig('plots/3.2.30.d Distribution of Trip rentals based on hour of the day over fare type.png', dpi=300, bbox_inches='tight')
Heat Map:
Now let's pivot the categorical dataset created earlier into a more appropriate data structure.
# Reshape the long-format counts into a matrix: one row per fare type,
# one column per hour, with the 'rentals' value in each cell.
categorical_counts = categorical_counts.pivot(index = 'fare_type', columns = 'hour', values = 'rentals')
# display the pivoted table
categorical_counts
The data is ready to be plotted as a
heat map.
# Heat map of rentals per fare type and hour; each cell is annotated with
# its share of all trips.
plt.figure(figsize=[18, 2])
res = sb.heatmap(categorical_counts, annot=True, fmt='.0f', annot_kws={'size': 10},
                 linewidths=0.1, cmap="YlGnBu")
plt.title('Hourly distribution of bike rentals based on fare type\n', fontsize=16, weight='bold')
plt.xlabel('\nHour of the day', fontsize=14)
plt.ylabel('Fare type\n', fontsize=14);
plt.xticks(fontsize="12")
plt.yticks(rotation=0, fontsize="12", va="center")

# replace every raw-count annotation with its percentage of the total number of trips
for t in res.texts:
    share = int(t.get_text()) / bikeshare.shape[0]
    p = np.round(np.round(share, 4) * 100, 1)
    # shares that round below 0.1 % are shown as '< 0.1%'
    t.set_text('< 0.1%' if p < 0.1 else str(p) + ' %');

# bbox_inches='tight' makes savefig include all of the x and y labels
plt.savefig('plots/3.2.30.e Heat map of Trip rentals based on hour of the day over fare type.png', dpi=300, bbox_inches='tight')
Find average rentals based on the hour of the day:
The `fare_type` column needs to be converted into a categorical datatype; otherwise the `groupby` method will ignore `NaN` rental values and in turn produce an incorrect average rental value. Also, the `size()` method ignores the unused level combinations of the groups, hence the `count()` method is used.
# Count the rentals for every (year, month, day, hour, fare type) combination.
# observed=False is passed explicitly: the later averages rely on unused
# categorical levels showing up with a count of 0, and relying on the implicit
# default is deprecated in recent pandas releases.
# NOTE(review): combinations of the grouping keys that never occur may also
# appear as 0-rental rows and pull the averages down - confirm this is intended.
hours_df = bikeshare.groupby([bikeshare["year"],
                              bikeshare["month"],
                              bikeshare["day"],
                              bikeshare["hour"],
                              bikeshare["fare_type"]],
                             observed=False).count()['trip_id'].reset_index(name='rentals')
# guard against missing counts and keep the column as integers
hours_df['rentals'] = hours_df['rentals'].fillna(0).astype(int)
hours_df.head(10)
# Point plot of the average number of rentals per hour, one line per fare type.
plt.figure(figsize=[8, 5])
sb.set_style('white')
flatui = ['#577da1', '#eda668']
sb.set_palette(flatui, n_colors=2, desat=0.6)
base_color = sb.color_palette()[0]
sb.pointplot(data=hours_df, x="hour", y="rentals", hue='fare_type',
             linestyles="-", scale=1, ci=None)
plt.title('Average hourly bike rentals categorized by fare type\n', weight='bold', fontsize=16)
plt.xlabel('\nHour of the day', fontsize=14)
plt.ylabel('Avg. bike rentals\n', fontsize=14)
plt.xticks(fontsize=12)

# fixed y ticks from 0 up to the hard-coded maximum, in steps of 10
locs, labels = plt.yticks()
hours_rental_avg_max = 70
y_tick_values = np.arange(0, hours_rental_avg_max+10, 10)
y_tick_names = list(map('{:0.0f}'.format, y_tick_values))
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

legend_style = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
                    framealpha=1, borderpad=1, borderaxespad=1, loc='upper left', labelspacing=0.5,
                    title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
                    markerfirst=True, handlelength=2, handletextpad=0.5)
plt.legend(**legend_style)
sb.despine(top=True, right=True, left=False, bottom=False);

# faint dashed guide line at every y tick
for loc in y_tick_values:
    plt.axhline(loc, ls='--', color='grey', linewidth=0.5, alpha=0.2)
plt.savefig('plots/3.2.30.f Average hourly bike rentals categorized by fare type.png', dpi=300, bbox_inches='tight')
bikeshare.daytime and trip_type columns:
Columns: daytime, trip_type
Data type: (Categorical, ordered) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
Stack plot:
# Stacked bar chart: rentals per time of day, stacked by trip type.
sb.set_style('white')
flatui = ["cyan", "blue"]
sb.set_palette(flatui, desat=0.4)
plt.figure(figsize=[8, 6])

cat1_order = bikeshare.daytime.sort_values(ascending=True).unique()
cat2_order = bikeshare.trip_type.sort_values(ascending=True).unique()
bar_positions = np.arange(len(cat1_order))

# draw one layer of bars per trip type on top of the running baseline
baselines = np.zeros(len(cat1_order))
for cat2 in cat2_order:
    # rentals per time of day for this trip type only
    inner_counts = bikeshare[bikeshare['trip_type'] == cat2]['daytime'].value_counts()
    layer_heights = inner_counts[cat1_order].fillna(0)
    plt.bar(x=bar_positions, height=layer_heights, bottom=baselines, alpha=0.6)
    baselines += layer_heights

# y ticks every 50,000 rentals, labelled in thousands
daytime_totals = bikeshare.groupby([bikeshare['daytime']]).size().reset_index(name='rentals')
max_count = daytime_totals.max()['rentals']
y_tick_values = np.arange(0, max_count + 50000, 50000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]

plt.title('Aggregated daytime rentals based on trip type\n', weight='bold', fontsize=16)
plt.xlabel('\nTime of the day', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.xticks(bar_positions, cat1_order, fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

legend_style = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
                    framealpha=1, borderpad=1, borderaxespad=1, loc='upper left', labelspacing=0.5,
                    title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
                    markerfirst=True, handlelength=2, handletextpad=0.5)
plt.legend(cat2_order, **legend_style)
sb.despine();
# bbox_inches='tight' makes savefig include all of the x and y labels
plt.savefig('plots/3.2.31.a Stack plot of Trip rentals based on daytime over trip type.png', dpi=300, bbox_inches='tight')
Line Plot:
The data needs some summarization by grouping together the respective variables. Then, reset the index and name the summarized data values accordingly.
# Count the rentals for each (trip_type, daytime) pair and flatten the result
# into a dataframe whose count column is named 'rentals'.
categorical_counts = bikeshare.groupby([bikeshare['trip_type'],
bikeshare['daytime']]).size().reset_index(name='rentals')
# preview the first ten rows
categorical_counts.head(10)
# Line plot: total rentals per time of day, one line per trip type.
# set the palette as per requirement
flatui = ["#78ffe0", "#9178ff"]
sb.set_palette(flatui, n_colors = 2, desat = 0.4)
sb.set_style('white')
plt.figure(figsize = [6, 4])
# y ticks every 50,000 rentals, labelled in thousands
max_count = categorical_counts.rentals.max()
y_tick_values = np.arange(0, max_count + 50000, 50000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
# plot line plot
ax = sb.lineplot(data=categorical_counts, x = "daytime", y = "rentals", hue="trip_type",
                 style="trip_type", err_style="bars", linewidth=3)
plt.title('Aggregated daytime rentals based on trip type\n', weight = 'bold', fontsize = 16)
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nTime of the day', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
# Build the legend in a single call. A second, bare ax.legend() would rebuild it with
# default styling, and overwriting legend.texts[0] renames the first category whenever
# seaborn does not emit the hue name as a leading pseudo-entry.
handles, labels = ax.get_legend_handles_labels()
if labels and labels[0] == 'trip_type':
    # some seaborn releases prepend the hue variable name; the legend title replaces it
    handles, labels = handles[1:], labels[1:]
legend = ax.legend(handles, labels, scatterpoints=1, frameon=True, fancybox=True, shadow=False,
                   ncol = 1, framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper left',
                   labelspacing=0.5, title='Trip type', title_fontsize=12, fontsize=10,
                   facecolor='white', markerfirst=True, handlelength=2, handletextpad=0.5)
# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.31.b Line plot of Trip rentals based on daytime over trip type.png', dpi=300, bbox_inches='tight')
Individual plots of aggregated distribution of daytime rentals over trip type:
# Facet grid: one count plot of the time of day per trip type.
from matplotlib.ticker import FuncFormatter

# set the palette as per requirement
flatui = ["#78ffe0", "#9178ff"]
sb.set_palette(flatui, n_colors = 2, desat = 0.4)
base_color = sb.color_palette()[0]
sb.set_style('white')
# FacetGrid creates its own figure, so no separate plt.figure() call is made
# (it would only leave an empty, unused figure behind).
g = sb.FacetGrid(data = bikeshare, col = 'trip_type', col_wrap = 2, height = 4, aspect = 1.2)
g.map(sb.countplot, "daytime", color = base_color, order = bikeshare.daytime.sort_values(ascending=True).unique())
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Aggregated daytime distribution of bike rentals categorized by trip type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
# Show the y ticks in thousands. A formatter is attached to every facet instead of
# reading the tick label texts back and re-setting them: collecting the labels of all
# facets into one list yields more labels than ticks, and parsing label text with
# int() depends on how the default formatter happens to render the values.
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(FuncFormatter(lambda value, pos: '{:0.0f} K'.format(value/1000)))
    ax.tick_params(axis = 'both', labelsize = 12)
g.set_xlabels('\nTime of the day', size = 14)
g.set_ylabels('Rentals (thousands)\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3);
# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.31.c Facet grid of Trip rentals based on daytime over trip type.png', dpi=300, bbox_inches='tight')
Let us plot a wider variety of graphs to observe the distribution of the data and uncover hidden insights.
Heat Map:
One alternative way of depicting the relationship between two categorical variables is through a Heat map. Heat maps were introduced earlier as the 2-d version of a histogram; here, we're using them as the 2-d version of a bar chart. The seaborn function heatmap is at home with this type of heat map implementation, but the input arguments are unlike most of the visualization functions. Instead of providing the original dataframe, we need to summarize the counts into a matrix that will then be plotted.
Now let's pivot the categorical dataset created earlier into a more appropriate data structure.
# Reshape the long-format counts into a matrix: one row per trip type,
# one column per time of day, with the 'rentals' value in each cell.
categorical_counts = categorical_counts.pivot(index = 'trip_type', columns = 'daytime', values = 'rentals')
# display the pivoted table
categorical_counts
The data is ready to be plotted as a
heat map.
# Heat map of rentals per trip type and time of day; each cell is annotated
# with its share of all trips.
plt.figure(figsize=[8, 2])
res = sb.heatmap(categorical_counts, annot=True, fmt='.0f', annot_kws={'size': 12},
                 linewidths=0.1, cmap="YlGnBu")
plt.title('Daytime distribution of rentals based on trip type\n', fontsize=16, weight='bold')
plt.xlabel('\nTime of the day', fontsize=14)
plt.ylabel('Trip type\n', fontsize=14);
plt.xticks(fontsize="12")
plt.yticks(rotation=0, fontsize="12", va="center")

# replace every raw-count annotation with its percentage of the total number of trips
for t in res.texts:
    share = int(t.get_text()) / bikeshare.shape[0]
    p = np.round(np.round(share, 4) * 100, 1)
    t.set_text(str(p) + ' %')

# bbox_inches='tight' makes savefig include all of the x and y labels
plt.savefig('plots/3.2.31.d Heat map of Trip rentals based on daytime over trip type.png', dpi=300, bbox_inches='tight')
Find average rentals based on the daytime:
The `trip_type` column needs to be converted into a categorical datatype; otherwise the `groupby` method will ignore `NaN` rental values and in turn produce an incorrect average rental value. Also, the `size()` method ignores the unused level combinations of the groups, hence the `count()` method is used.
# Count the rentals for every (year, month, day, daytime, trip type) combination.
# observed=False is passed explicitly: the later averages rely on unused
# categorical levels showing up with a count of 0, and relying on the implicit
# default is deprecated in recent pandas releases.
# NOTE(review): combinations of the grouping keys that never occur may also
# appear as 0-rental rows and pull the averages down - confirm this is intended.
daytime_df = bikeshare.groupby([bikeshare["year"],
                                bikeshare["month"],
                                bikeshare["day"],
                                bikeshare["daytime"],
                                bikeshare["trip_type"]],
                               observed=False).count()['trip_id'].reset_index(name='rentals')
# guard against missing counts and keep the column as integers
daytime_df['rentals'] = daytime_df['rentals'].fillna(0).astype(int)
daytime_df.head(10)
# Point plot of the average rentals per time of day for each trip type,
# with the average value written above every point.
plt.figure(figsize=[8, 5])
sb.set_style('white')
flatui = ["#fff480"]
sb.set_palette(flatui, n_colors=1, desat=0.8)
base_color = sb.color_palette()[0]
sb.pointplot(data = daytime_df, x = "daytime", y = "rentals", linestyles = ['-', '-'], hue = 'trip_type')
plt.title('Average daytime bike rentals categorized by trip type\n', weight = 'bold', fontsize = 16)
plt.ylabel('Avg. bike rentals\n', fontsize = 14)
plt.xlabel('\nTime of the day', fontsize = 14)
plt.xticks(fontsize = 12)
# fixed y ticks from 0 up to the hard-coded maximum, in steps of 50
daytime_rental_avg_max = 300
y_tick_values = np.arange(0, daytime_rental_avg_max+50, 50)
y_tick_names = ['{:0.0f}'.format(v) for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize = 12)
# add annotations
# -------------------------------------------------------
# average rentals per (trip type, time of day)
avg_rentals = daytime_df.groupby([daytime_df["trip_type"], daytime_df["daytime"]]).mean()['rentals'].reset_index()
avg_rentals_max = avg_rentals.rentals.max()
avg_rentals_oneway = avg_rentals.query(' trip_type == "One Way" ')
avg_rentals_roundtrip = avg_rentals.query(' trip_type == "Round Trip" ')
# get the current x tick locations and labels
locs, labels = plt.xticks()
# vertical offset of each annotation above its point (the same for both trip types)
indent = 15
# The averages are read straight from the dataframe, so no KeyError handling is needed,
# and the highlight colour depends only on which trip type is being annotated.
for categorical_df, clr in [(avg_rentals_oneway, 'gold'), (avg_rentals_roundtrip, 'limegreen')]:
    # pair every x tick location with the average of the matching row
    for loc, count in zip(locs, categorical_df.rentals):
        pct_string = '{:0.0f}'.format(count)
        plt.text(loc, count + indent, pct_string, ha = 'center', color = 'black',
                 fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
           framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper left', labelspacing=0.5,
           title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5)
sb.despine(top=True, right=True, left=False, bottom=False);
# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.31.e Average daytime bike rentals categorized by trip type.png', dpi=300, bbox_inches='tight')
# Compact point plot of the average rentals per time of day for each trip type,
# drawn with dashed lines and horizontal guide lines.
plt.figure(figsize=[6, 4])
sb.set_style('white')
flatui = ["#fff480"]
sb.set_palette(flatui, n_colors=1, desat=0.8)
base_color = sb.color_palette()[0]
sb.pointplot(data=daytime_df, x="daytime", y="rentals", hue='trip_type', linestyles=['--', '--'])
plt.title('Average daytime bike rentals categorized by trip type\n', weight='bold', fontsize=16)
plt.xlabel('\nTime of the day', fontsize=14)
plt.ylabel('Avg. bike rentals\n', fontsize=14)
plt.xticks(fontsize=12)

# fixed y ticks from 0 up to the hard-coded maximum, in steps of 50
locs, labels = plt.yticks()
daytime_rental_avg_max = 300
y_tick_values = np.arange(0, daytime_rental_avg_max+50, 50)
y_tick_names = list(map('{:0.0f}'.format, y_tick_values))
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

legend_style = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
                    framealpha=1, borderpad=1, borderaxespad=1, loc='upper left', labelspacing=0.5,
                    title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
                    markerfirst=True, handlelength=2, handletextpad=0.5)
plt.legend(**legend_style)
sb.despine(top=True, right=True, left=False, bottom=False);

# faint dashed guide line at every y tick
for loc in y_tick_values:
    plt.axhline(loc, ls='--', color='grey', linewidth=0.5, alpha=0.2)
# bbox_inches='tight' makes savefig include all of the x and y labels
plt.savefig('plots/3.2.31.f Average daytime bike rentals categorized by trip type.png', dpi=300, bbox_inches='tight')
bikeshare.daytime and bike_type columns:
Columns: daytime, bike_type
Data type: (Categorical, ordered) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
Stack plot:
# Stacked bar chart: rentals per time of day, stacked by bike type.
sb.set_style('white')
flatui = ["darkslateblue", "lightseagreen", "royalblue", "rebeccapurple"]
sb.set_palette(flatui, desat=0.8)
plt.figure(figsize=[8, 6])

cat1_order = bikeshare.daytime.sort_values(ascending=True).unique()
cat2_order = bikeshare.bike_type.sort_values(ascending=True).unique()
bar_positions = np.arange(len(cat1_order))

# draw one layer of bars per bike type on top of the running baseline
baselines = np.zeros(len(cat1_order))
for cat2 in cat2_order:
    # rentals per time of day for this bike type only
    inner_counts = bikeshare[bikeshare['bike_type'] == cat2]['daytime'].value_counts()
    layer_heights = inner_counts[cat1_order].fillna(0)
    plt.bar(x=bar_positions, height=layer_heights, bottom=baselines, alpha=0.8)
    baselines += layer_heights

# y ticks every 50,000 rentals, labelled in thousands
daytime_totals = bikeshare.groupby([bikeshare['daytime']]).size().reset_index(name='rentals')
max_count = daytime_totals.max()['rentals']
y_tick_values = np.arange(0, max_count + 50000, 50000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]

plt.title('Aggregated daytime rentals based on bike type\n', weight='bold', fontsize=16)
plt.xlabel('\nTime of the day', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.xticks(bar_positions, cat1_order, fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

legend_style = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
                    framealpha=1, borderpad=1, borderaxespad=1, loc='upper left', labelspacing=0.5,
                    title='Bike type', title_fontsize=12, fontsize=10, facecolor='white',
                    markerfirst=True, handlelength=2, handletextpad=0.5)
plt.legend(cat2_order, **legend_style)
sb.despine();
# bbox_inches='tight' makes savefig include all of the x and y labels
plt.savefig('plots/3.2.32.a Stack plot of rentals based on bike type and daytime.png', dpi=300, bbox_inches='tight')
Line plot:
The data needs some summarization by grouping together the respective variables. Then, reset the index and name the summarized data values accordingly.
# Count the rentals for each (bike_type, daytime) pair and flatten the result
# into a dataframe whose count column is named 'rentals'.
categorical_counts = bikeshare.groupby([bikeshare['bike_type'],
bikeshare['daytime']]).size().reset_index(name='rentals')
# preview the first ten rows
categorical_counts.head(10)
# Line plot: total rentals per time of day, one line per bike type.
# set the palette as per requirement
sb.set_style('white')
# alternative palette: ["darkslateblue", "lightseagreen", "royalblue", "rebeccapurple"]
flatui = ['#ff91e2', '#91ffda', '#60acfc', '#bd91ff']
sb.set_palette(flatui, n_colors=4, desat=0.6)
plt.figure(figsize = [6, 4])
# y ticks every 25,000 rentals, labelled in thousands
max_count = categorical_counts.rentals.max()
y_tick_values = np.arange(0, max_count + 25000, 25000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
# plot line plot
ax = sb.lineplot(data = categorical_counts, x = "daytime", y = "rentals", hue="bike_type", linewidth=3)
plt.title('Aggregated daytime rentals based on bike type\n', weight = 'bold', fontsize = 16)
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nTime of the day', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
# Build the legend in a single call. A second, bare ax.legend() would rebuild it with
# default styling, and overwriting legend.texts[0] renames the first category whenever
# seaborn does not emit the hue name as a leading pseudo-entry.
handles, labels = ax.get_legend_handles_labels()
if labels and labels[0] == 'bike_type':
    # some seaborn releases prepend the hue variable name; the legend title replaces it
    handles, labels = handles[1:], labels[1:]
legend = ax.legend(handles, labels, scatterpoints=1, frameon=True, fancybox=True, shadow=False,
                   ncol = 1, framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper left',
                   labelspacing=0.5, title='Bike type', title_fontsize=12, fontsize=10,
                   facecolor='white', markerfirst=True, handlelength=2, handletextpad=0.5)
# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.32.b Line plot of rentals based on bike type and daytime.png', dpi=300, bbox_inches='tight')
Individual plots of aggregated distribution of daytime rentals over bike type:
# Facet grid: one count plot of the time of day per bike type.
from matplotlib.ticker import FuncFormatter

# set the palette as per requirement
flatui = ["#78ffe0", "#9178ff"]
sb.set_palette(flatui, n_colors = 2, desat = 0.4)
base_color = sb.color_palette()[0]
sb.set_style('white')
# FacetGrid creates its own figure, so no separate plt.figure() call is made
# (it would only leave an empty, unused figure behind).
g = sb.FacetGrid(data = bikeshare, col = 'bike_type', col_wrap = 2, height = 4, aspect = 1.2)
g.map(sb.countplot, "daytime", color = base_color, order = bikeshare.daytime.sort_values(ascending=True).unique())
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Aggregated daytime distribution of bike rentals categorized by bike type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
# Show the y ticks in thousands. A formatter is attached to every facet instead of
# reading the tick label texts back and re-setting them: collecting the labels of all
# facets into one list yields more labels than ticks, and parsing label text with
# int() depends on how the default formatter happens to render the values.
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(FuncFormatter(lambda value, pos: '{:0.0f} K'.format(value/1000)))
    ax.tick_params(axis = 'both', labelsize = 12)
g.set_xlabels('\nTime of the day', size = 14)
g.set_ylabels('Rentals (thousands)\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3);
# savefig by passing (bbox_inches='tight'), which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.32.c Facet grid of rentals based on bike type and daytime.png', dpi=300, bbox_inches='tight')
Let us plot a wider variety of graphs to observe the distribution of the data and uncover hidden insights.
Heat Map:
Now let's pivot the categorical dataset created earlier into a more appropriate data structure.
# Reshape the long-format counts into a matrix: one row per bike type,
# one column per time of day, with the 'rentals' value in each cell.
categorical_counts = categorical_counts.pivot(index = 'bike_type', columns = 'daytime', values = 'rentals')
# display the pivoted table
categorical_counts
The data is ready to be plotted as a
heat map.
# Heat map of rentals by bike type and time of day; every cell is annotated
# with its share of all trips.
plt.figure(figsize=[8, 4])
res = sb.heatmap(categorical_counts, annot=True, fmt='.0f', annot_kws={'size': 12},
                 linewidths=0.1, cmap="YlGnBu")
plt.title('Daytime distribution of bike rentals based on bike type\n', fontsize=16, weight='bold')
plt.xlabel('\nTime of the day', fontsize=14)
plt.ylabel('Bike type\n', fontsize=14)
plt.xticks(fontsize="12")
plt.yticks(rotation=0, fontsize="12", va="center")

# The annotations start out as raw counts; replace each with a percentage
# of the total number of trips.
total_trips = bikeshare.shape[0]
for cell_text in res.texts:
    share = np.round(int(cell_text.get_text()) / total_trips, 4)
    pct = np.round(share * 100, 1)
    cell_text.set_text(str(pct) + ' %')

# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.32.d Heat map of rentals based on bike type and daytime.png', dpi=300, bbox_inches='tight')
Find average rentals based on the daytime:
The `bike_type` column needs to be converted into a categorical datatype; otherwise the `groupby` method will ignore `NaN` rental values and in turn produce an incorrect avg. rental value. Also, the `size()` method ignores the unused level combinations of the groups, hence use the `count()` method.
# Count trips per calendar day, time of day and bike type.
group_keys = [bikeshare[column]
              for column in ("year", "month", "day", "daytime", "bike_type")]
trip_counts = bikeshare.groupby(group_keys).count()['trip_id']
daytime_df = trip_counts.reset_index(name='rentals')

# Missing counts become zero so the column can be stored as integers.
daytime_df['rentals'] = daytime_df['rentals'].fillna(0).astype(int)
daytime_df.head(10)
# Point plot of the average rentals per time of day, one line per bike type.
plt.figure(figsize=[8, 5])
sb.set_style('white')
flatui = ['#ff7ddd', '#77f7cc', '#4b99eb', '#aa75fa']
sb.set_palette(flatui, n_colors=4, desat=0.6)
base_color = sb.color_palette()[0]

sb.pointplot(data=daytime_df, x="daytime", y="rentals", hue='bike_type',
             linestyles="-", scale=1, ci=None)

plt.title('Average daytime bike rentals categorized by bike type\n', weight='bold', fontsize=16)
plt.xlabel('\nTime of the day', fontsize=14)
plt.ylabel('Avg. bike rentals\n', fontsize=14)
plt.xticks(fontsize=12)

# Current tick positions are read, then replaced by a fixed 0..300 scale in steps of 50.
locs, labels = plt.yticks()
daytime_rental_avg_max = 300
y_tick_values = np.arange(0, daytime_rental_avg_max + 50, 50)
y_tick_names = [f'{v:0.0f}' for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

legend_style = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
    framealpha=1, borderpad=1, borderaxespad=1, loc='upper left', labelspacing=0.5,
    title='Bike type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5,
)
plt.legend(**legend_style)
sb.despine(top=True, right=True, left=False, bottom=False)

# Faint horizontal guide line at every y tick.
for level in y_tick_values:
    plt.axhline(level, ls='--', color='grey', linewidth=0.5, alpha=0.2)

# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.32.e Average daytime bike rentals categorized by bike type.png', dpi=300, bbox_inches='tight')
bikeshare.daytime and pass_type columns:
Columns: daytime, pass_type
Data type: (Categorical, ordered) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
Stack plot:
# Stacked bar chart: rentals per time of day, stacked by pass type.
sb.set_style('white')
plt.figure(figsize=[8, 6])
flatui = ["#26bda7", "#9b59b6", "#3498db", "#e74c3c", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.6)

cat1_order = bikeshare.daytime.sort_values(ascending=True).unique()
cat2_order = bikeshare.pass_type.sort_values(ascending=True).unique()
x_positions = np.arange(len(cat1_order))

# Each pass type is drawn on top of the running total of the ones before it.
baselines = np.zeros(len(cat1_order))
for cat2 in cat2_order:
    inner_counts = bikeshare[bikeshare['pass_type'] == cat2]['daytime'].value_counts()
    heights = inner_counts[cat1_order].fillna(0)
    plt.bar(x=x_positions, height=heights, bottom=baselines, alpha=0.8)
    baselines += heights

# y axis in steps of 50 K up to the busiest time of day
max_count = bikeshare.groupby([bikeshare['daytime']]).size().max()
y_tick_values = np.arange(0, max_count + 50000, 50000)
y_tick_names = [f'{v / 1000:0.0f} K' for v in y_tick_values]

plt.title('Aggregated daytime rentals based on pass type\n', weight='bold', fontsize=16)
plt.xlabel('\nTime of the day', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.xticks(x_positions, cat1_order)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

legend_style = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
    framealpha=1, borderpad=1, borderaxespad=1, loc='upper left', labelspacing=0.5,
    title='Pass type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5,
)
plt.legend(cat2_order, **legend_style)
sb.despine()

# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.33.a Stack plot of rentals based on pass type and daytime.png', dpi=300, bbox_inches='tight')
Line plot:
The data needs some summarization by grouping together the respective variables. Then, reset the index and name the summarized data values accordingly.
# Number of rentals for every (pass type, time of day) pair, as a flat table.
categorical_counts = (
    bikeshare
    .groupby([bikeshare['pass_type'], bikeshare['daytime']])
    .size()
    .reset_index(name='rentals')
)
categorical_counts.head(10)
# Line plot: aggregated rentals per time of day, one line per pass type.
# set the palette as per requirement
sb.set_style('white')
flatui = ["#26bda7", "#9b59b6", "#3498db", "#e74c3c", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.6)
plt.figure(figsize=[6, 4])

# y axis in steps of 50 K up to the largest group count
max_count = categorical_counts.rentals.max()
y_tick_values = np.arange(0, max_count + 50000, 50000)
y_tick_names = ['{:0.0f} K'.format(v / 1000) for v in y_tick_values]

# plot line plot
ax = sb.lineplot(data=categorical_counts, x="daytime", y="rentals", hue="pass_type",
                 linewidth=3, alpha=0.8)
plt.title('Aggregated daytime rentals based on pass type\n', weight='bold', fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nTime of the day', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)

# Some seaborn versions put the hue column name into the legend as its first
# entry, others use it as the legend title. Drop that entry only when it is
# really there, then set a proper title; blindly overwriting the first legend
# text would rename the first pass type on versions that use a real title.
handles, labels = ax.get_legend_handles_labels()
if labels and labels[0] == 'pass_type':
    handles, labels = handles[1:], labels[1:]
ax.legend(handles, labels, title='Pass type', scatterpoints=1, frameon=True, fancybox=True,
          shadow=False, ncol=1, framealpha=1, borderpad=1, borderaxespad=1,
          loc='upper left', labelspacing=0.5, title_fontsize=12, fontsize=10,
          facecolor='white', markerfirst=True, handlelength=2, handletextpad=0.5,
          bbox_to_anchor=(1, 1))
sb.despine()

# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.33.b Line plot of rentals based on pass type and daytime.png', dpi=300, bbox_inches='tight')
Individual plots of aggregated distribution of daytime rentals over pass type:
# Facet grid: one count plot of rentals per time of day for every pass type.
from matplotlib.ticker import FuncFormatter

# set the palette as per requirement
flatui = ["#78ffe0", "#9178ff"]
sb.set_palette(flatui, n_colors=2, desat=0.4)
base_color = sb.color_palette()[0]
sb.set_style('white')

# FacetGrid builds its own figure, so no separate plt.figure() call is made
# here; an extra call would only leave an empty figure behind.
daytime_order = bikeshare.daytime.sort_values(ascending=True).unique()
g = sb.FacetGrid(data=bikeshare, col='pass_type', col_wrap=3, height=4, aspect=1.2)
g.map(sb.countplot, "daytime", color=base_color, order=daytime_order)
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Aggregated daytime distribution of bike rentals categorized by pass type\n',
               fontsize=16, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Show the y ticks in thousands. A formatter works on the tick *values*, so it
# does not depend on the labels having been rendered already, and it cannot
# produce a label list whose length differs from the number of ticks.
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(
        FuncFormatter(lambda value, _pos: '{:0.0f} K'.format(value / 1000)))
    ax.tick_params(axis='both', labelsize=12)

g.set_xlabels('\nTime of the day', size=14)
g.set_ylabels('Rentals (Thousands)\n', size=14)
g.fig.subplots_adjust(wspace=0.2, hspace=0.3)

# bbox_inches='tight' grows the saved area so all x and y labels are included
g.fig.savefig('plots/3.2.33.c Facet grid of rentals based on pass type and daytime.png',
              dpi=300, bbox_inches='tight')
Let us plot more variety of graphs to observe the distribution of data and hidden insights.
Heat Map:
Now lets make a pivot from the categorical dataset created earlier into a more appropriate data structure.
# Reshape the grouped counts into a pass_type x daytime matrix for the heat map.
categorical_counts = categorical_counts.pivot(
    index='pass_type',
    columns='daytime',
    values='rentals',
)
categorical_counts
The data is ready to be plotted as a
heat map.
# Heat map of rentals by pass type and time of day; every cell is annotated
# with its share of all trips.
plt.figure(figsize=[8, 4])
res = sb.heatmap(categorical_counts, annot=True, fmt='.0f', annot_kws={'size': 12},
                 linewidths=0.1, cmap="YlGnBu")
plt.title('Daytime distribution of bike rentals based on pass type\n', fontsize=16, weight='bold')
plt.xlabel('\nTime of the day', fontsize=14)
plt.ylabel('Pass type\n', fontsize=14)
plt.xticks(fontsize="12")
plt.yticks(rotation=0, fontsize="12", va="center")

# Replace the raw counts with percentages of all trips; shares that round
# below 0.1 are shown as '< 0.1%' instead of '0.0 %'.
total_trips = bikeshare.shape[0]
for cell_text in res.texts:
    share = np.round(int(cell_text.get_text()) / total_trips, 4)
    pct = np.round(share * 100, 1)
    cell_label = '< 0.1%' if pct < 0.1 else str(pct) + ' %'
    cell_text.set_text(cell_label)

# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.33.d Heat map of rentals based on pass type and daytime.png', dpi=300, bbox_inches='tight')
Find average rentals based on the daytime:
The `pass_type` column needs to be converted into a categorical datatype; otherwise the `groupby` method will ignore `NaN` rental values and in turn produce an incorrect avg. rental value. Also, the `size()` method ignores the unused level combinations of the groups, hence use the `count()` method.
# Count trips per calendar day, time of day and pass type.
group_keys = [bikeshare[column]
              for column in ("year", "month", "day", "daytime", "pass_type")]
trip_counts = bikeshare.groupby(group_keys).count()['trip_id']
daytime_df = trip_counts.reset_index(name='rentals')

# Missing counts become zero so the column can be stored as integers.
daytime_df['rentals'] = daytime_df['rentals'].fillna(0).astype(int)
daytime_df.head(10)
# Point plot of the average rentals per time of day, one line per pass type.
plt.figure(figsize=[8, 5])
sb.set_style('white')
flatui = ["#34e0c7", "#c271e3", "#4cb1f5", "#e06458", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.6)
base_color = sb.color_palette()[0]

sb.pointplot(data=daytime_df, x="daytime", y="rentals", hue='pass_type',
             linestyles="-", scale=1, ci=None)

plt.title('Average daytime bike rentals categorized by pass type\n', weight='bold', fontsize=16)
plt.xlabel('\nTime of the day', fontsize=14)
plt.ylabel('Avg. bike rentals\n', fontsize=14)
plt.xticks(fontsize=12)

# Current tick positions are read, then replaced by a fixed 0..300 scale in steps of 50.
locs, labels = plt.yticks()
daytime_rental_avg_max = 300
y_tick_values = np.arange(0, daytime_rental_avg_max + 50, 50)
y_tick_names = [f'{v:0.0f}' for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

legend_style = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
    framealpha=1, borderpad=1, borderaxespad=1, loc='upper left', labelspacing=0.5,
    title='Pass type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5,
)
plt.legend(**legend_style)
sb.despine(top=True, right=True, left=False, bottom=False)

# Faint horizontal guide line at every y tick.
for level in y_tick_values:
    plt.axhline(level, ls='--', color='grey', linewidth=0.5, alpha=0.2)

# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.33.e Average daytime bike rentals categorized by pass type.png', dpi=300, bbox_inches='tight')
bikeshare.daytime and fare_type columns:
Columns: daytime, fare_type
Data type: (Categorical, ordered) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
Stack plot:
# Stacked bar chart: rentals per time of day, stacked by fare type.
sb.set_style('white')
plt.figure(figsize=[8, 6])
flatui = ['#466887', '#eda668']
sb.set_palette(flatui, n_colors=2, desat=0.6)

cat1_order = bikeshare.daytime.sort_values(ascending=True).unique()
cat2_order = bikeshare.fare_type.sort_values(ascending=True).unique()
x_positions = np.arange(len(cat1_order))

# Each fare type is drawn on top of the running total of the ones before it.
baselines = np.zeros(len(cat1_order))
for cat2 in cat2_order:
    inner_counts = bikeshare[bikeshare['fare_type'] == cat2]['daytime'].value_counts()
    heights = inner_counts[cat1_order].fillna(0)
    plt.bar(x=x_positions, height=heights, bottom=baselines, alpha=0.8)
    baselines += heights

# y axis in steps of 50 K up to the busiest time of day
max_count = bikeshare.groupby([bikeshare['daytime']]).size().max()
y_tick_values = np.arange(0, max_count + 50000, 50000)
y_tick_names = [f'{v / 1000:0.0f} K' for v in y_tick_values]

plt.title('Aggregated daytime rentals based on fare type\n', weight='bold', fontsize=16)
plt.xlabel('\nTime of the day', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.xticks(x_positions, cat1_order)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

legend_style = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
    framealpha=1, borderpad=1, borderaxespad=1, loc='upper left', labelspacing=0.5,
    title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5,
)
plt.legend(cat2_order, **legend_style)
sb.despine()

# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.34.a Stack plot of rentals based on fare type and daytime.png', dpi=300, bbox_inches='tight')
Line plot:
The data needs some summarization by grouping together the respective variables. Then, reset the index and name the summarized data values accordingly.
# Number of rentals for every (fare type, time of day) pair, as a flat table.
categorical_counts = (
    bikeshare
    .groupby([bikeshare['fare_type'], bikeshare['daytime']])
    .size()
    .reset_index(name='rentals')
)
categorical_counts.head(10)
# Line plot: aggregated rentals per time of day, one line per fare type.
# set the palette as per requirement
sb.set_style('white')
flatui = ['#466887', '#eda668']
sb.set_palette(flatui, n_colors=2, desat=0.6)
plt.figure(figsize=[6, 4])

# y axis in steps of 50 K up to the largest group count
max_count = categorical_counts.rentals.max()
y_tick_values = np.arange(0, max_count + 50000, 50000)
y_tick_names = ['{:0.0f} K'.format(v / 1000) for v in y_tick_values]

# plot line plot
ax = sb.lineplot(data=categorical_counts, x="daytime", y="rentals", hue="fare_type",
                 linewidth=3, alpha=0.8)
plt.title('Aggregated daytime rentals based on fare type\n', weight='bold', fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nTime of the day', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)

# Build the legend exactly once: a second bare ax.legend() call would replace
# the styled legend with a default one. Some seaborn versions put the hue
# column name into the legend as its first entry; drop it only when it is
# really there, so that a genuine fare type label is never overwritten.
handles, labels = ax.get_legend_handles_labels()
if labels and labels[0] == 'fare_type':
    handles, labels = handles[1:], labels[1:]
legend = ax.legend(handles, labels, title='Fare type', scatterpoints=1, frameon=True,
                   fancybox=True, shadow=False, ncol=1, framealpha=1, borderpad=1,
                   borderaxespad=1, loc='upper right', labelspacing=0.5,
                   title_fontsize=12, fontsize=10, facecolor='white',
                   markerfirst=True, handlelength=2, handletextpad=0.5)

# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.34.b Line plot of rentals based on fare type and daytime.png', dpi=300, bbox_inches='tight')
Individual plots of aggregated distribution of daytime rentals over fare type:
# Facet grid: one count plot of rentals per time of day for every fare type.
from matplotlib.ticker import FuncFormatter

# set the palette as per requirement
flatui = ["#78ffe0", "#9178ff"]
sb.set_palette(flatui, n_colors=2, desat=0.4)
base_color = sb.color_palette()[0]
sb.set_style('white')

# FacetGrid builds its own figure, so no separate plt.figure() call is made
# here; an extra call would only leave an empty figure behind.
daytime_order = bikeshare.daytime.sort_values(ascending=True).unique()
g = sb.FacetGrid(data=bikeshare, col='fare_type', col_wrap=2, height=4, aspect=1.2)
g.map(sb.countplot, "daytime", color=base_color, order=daytime_order)
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Aggregated daytime distribution of bike rentals categorized by fare type\n',
               fontsize=16, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Show the y ticks in thousands. A formatter works on the tick *values*, so it
# does not depend on the labels having been rendered already, and it cannot
# produce a label list whose length differs from the number of ticks.
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(
        FuncFormatter(lambda value, _pos: '{:0.0f} K'.format(value / 1000)))
    ax.tick_params(axis='both', labelsize=12)

g.set_xlabels('\nTime of the day', size=14)
g.set_ylabels('Rentals (Thousands)\n', size=14)
g.fig.subplots_adjust(wspace=0.2, hspace=0.3)

# bbox_inches='tight' grows the saved area so all x and y labels are included
g.fig.savefig('plots/3.2.34.c Facet grid of rentals based on fare type and daytime.png',
              dpi=300, bbox_inches='tight')
Let us plot more variety of graphs to observe the distribution of data and hidden insights.
Heat Map:
Now lets make a pivot from the categorical dataset created earlier into a more appropriate data structure.
# Reshape the grouped counts into a fare_type x daytime matrix for the heat map.
categorical_counts = categorical_counts.pivot(
    index='fare_type',
    columns='daytime',
    values='rentals',
)
categorical_counts
The data is ready to be plotted as a
heat map.
# Heat map of rentals by fare type and time of day; every cell is annotated
# with its share of all trips.
plt.figure(figsize=[8, 2])
res = sb.heatmap(categorical_counts, annot=True, fmt='.0f', annot_kws={'size': 12},
                 linewidths=0.1, cmap="YlGnBu")
plt.title('Daytime distribution of bike rentals based on fare type\n', fontsize=16, weight='bold')
plt.xlabel('\nTime of the day', fontsize=14)
plt.ylabel('Fare type\n', fontsize=14)
plt.xticks(fontsize="12")
plt.yticks(rotation=0, fontsize="12", va="center")

# Replace the raw counts with percentages of all trips; shares that round
# below 0.1 are shown as '< 0.1%' instead of '0.0 %'.
total_trips = bikeshare.shape[0]
for cell_text in res.texts:
    share = np.round(int(cell_text.get_text()) / total_trips, 4)
    pct = np.round(share * 100, 1)
    cell_label = '< 0.1%' if pct < 0.1 else str(pct) + ' %'
    cell_text.set_text(cell_label)

# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.34.d Heat map of rentals based on fare type and daytime.png', dpi=300, bbox_inches='tight')
Find average rentals based on the daytime:
The `fare_type` column needs to be converted into a categorical datatype; otherwise the `groupby` method will ignore `NaN` rental values and in turn produce an incorrect avg. rental value. Also, the `size()` method ignores the unused level combinations of the groups, hence use the `count()` method.
# Count trips per calendar day, time of day and fare type.
group_keys = [bikeshare[column]
              for column in ("year", "month", "day", "daytime", "fare_type")]
trip_counts = bikeshare.groupby(group_keys).count()['trip_id']
daytime_df = trip_counts.reset_index(name='rentals')

# Missing counts become zero so the column can be stored as integers.
daytime_df['rentals'] = daytime_df['rentals'].fillna(0).astype(int)
daytime_df.head(10)
# Point plot of the average rentals per time of day, one line per fare type.
plt.figure(figsize=[8, 5])
sb.set_style('white')
flatui = ['#577da1', '#eda668']
sb.set_palette(flatui, n_colors=2, desat=0.6)
base_color = sb.color_palette()[0]

sb.pointplot(data=daytime_df, x="daytime", y="rentals", hue='fare_type',
             linestyles="-", scale=1, ci=None)

plt.title('Average daytime bike rentals categorized by fare type\n', weight='bold', fontsize=16)
plt.xlabel('\nTime of the day', fontsize=14)
plt.ylabel('Avg. bike rentals\n', fontsize=14)
plt.xticks(fontsize=12)

# Current tick positions are read, then replaced by a fixed 0..300 scale in steps of 50.
locs, labels = plt.yticks()
daytime_rental_avg_max = 300
y_tick_values = np.arange(0, daytime_rental_avg_max + 50, 50)
y_tick_names = [f'{v:0.0f}' for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

legend_style = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
    framealpha=1, borderpad=1, borderaxespad=1, loc='upper left', labelspacing=0.5,
    title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5,
)
plt.legend(**legend_style)
sb.despine(top=True, right=True, left=False, bottom=False)

# Faint horizontal guide line at every y tick.
for level in y_tick_values:
    plt.axhline(level, ls='--', color='grey', linewidth=0.5, alpha=0.2)

# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.34.e Average daytime bike rentals categorized by fare type.png', dpi=300, bbox_inches='tight')
bikeshare.day and trip_type columns:
Columns: day, trip_type
Data type: (Categorical, ordered) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
Stack plot:
# Stacked bar chart: rentals per day of the month, stacked by trip type.
sb.set_style('white')

# Take two colours out of the 'colorblind' palette and make them the active palette.
sb.set_palette(palette='colorblind', n_colors=10, desat=1)
colorblind = sb.color_palette()
flatui = [colorblind[4], colorblind[6]]
sb.set_palette(flatui, desat=0.8)
plt.figure(figsize=[12, 5])

cat1_order = bikeshare.day.sort_values(ascending=True).unique()
cat2_order = bikeshare.trip_type.sort_values(ascending=True).unique()
x_positions = np.arange(len(cat1_order))

# Each trip type is drawn on top of the running total of the ones before it.
baselines = np.zeros(len(cat1_order))
for cat2 in cat2_order:
    inner_counts = bikeshare[bikeshare['trip_type'] == cat2]['day'].value_counts()
    heights = inner_counts[cat1_order].fillna(0)
    plt.bar(x=x_positions, height=heights, bottom=baselines, alpha=0.8)
    baselines += heights

# y axis in steps of 10 K up to the busiest day of the month
max_count = bikeshare.groupby([bikeshare['day']]).size().max()
y_tick_values = np.arange(0, max_count + 10000, 10000)
y_tick_names = [f'{v / 1000:0.0f} K' for v in y_tick_values]

plt.title('Aggregated daily rentals based on trip type\n', weight='bold', fontsize=16)
plt.xlabel('\nDay of the month', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.xticks(x_positions, cat1_order, fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

legend_style = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
    framealpha=1, borderpad=1, borderaxespad=1, loc='upper right', labelspacing=0.5,
    title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.15, 1),
)
plt.legend(cat2_order, **legend_style)
sb.despine()

# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.35.a Stack plot of Trip rentals based on trip type and day of the month.png', dpi=300, bbox_inches='tight')
Line Plot:
The data needs some summarization by grouping together the respective variables. Then, reset the index and name the summarized data values accordingly.
# Number of rentals for every (trip type, day of the month) pair, as a flat table.
categorical_counts = (
    bikeshare
    .groupby([bikeshare['trip_type'], bikeshare['day']])
    .size()
    .reset_index(name='rentals')
)
categorical_counts.head(10)
# Line plot: aggregated rentals per day of the month, one line per trip type.
# set the palette as per requirement
sb.set_palette(palette='colorblind', n_colors=10, desat=1)
flatui = [sb.color_palette()[4], sb.color_palette()[6]]
sb.set_palette(flatui, desat=0.8)
sb.set_style('white')
plt.figure(figsize=[6, 4])

# y axis in steps of 10 K up to the largest group count
max_count = categorical_counts.rentals.max()
y_tick_values = np.arange(0, max_count + 10000, 10000)
y_tick_names = ['{:0.0f} K'.format(v / 1000) for v in y_tick_values]

# plot line plot
ax = sb.lineplot(data=categorical_counts, x="day", y="rentals", hue="trip_type",
                 style="trip_type", err_style="bars", linewidth=3)
plt.title('Aggregated daily rentals based on trip type\n', weight='bold', fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nDay of the month', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)

# Some seaborn versions put the hue column name into the legend as its first
# entry, others use it as the legend title. Drop that entry only when it is
# really there and rely on the legend title instead, so no trip type label is
# ever overwritten (this legend describes trip types, not bike types).
handles, labels = ax.get_legend_handles_labels()
if labels and labels[0] == 'trip_type':
    handles, labels = handles[1:], labels[1:]
ax.legend(handles, labels, title='Trip type', scatterpoints=1, frameon=True, fancybox=True,
          shadow=False, ncol=1, framealpha=1, borderpad=1, borderaxespad=1,
          loc='upper right', labelspacing=0.5, title_fontsize=12, fontsize=10,
          facecolor='white', markerfirst=True, handlelength=2, handletextpad=0.5,
          bbox_to_anchor=(1.35, 1))

# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.35.b Line plot of Trip rentals based on trip type and day of the month.png', dpi=300, bbox_inches='tight')
Individual plots of aggregated distribution of daily rentals over trip type:
# Facet grid: one count plot of rentals per day of the month for every trip type.
from matplotlib.ticker import FuncFormatter

# set the palette as per requirement
sb.set_palette(palette='colorblind', n_colors=10, desat=1)
flatui = [sb.color_palette()[4], sb.color_palette()[6]]
sb.set_palette(flatui, desat=0.8)
base_color = sb.color_palette()[0]
sb.set_style('white')

# FacetGrid builds its own figure, so no separate plt.figure() call is made
# here; an extra call would only leave an empty figure behind.
day_order = bikeshare.day.sort_values(ascending=True).unique()
g = sb.FacetGrid(data=bikeshare, col='trip_type', col_wrap=2, height=4, aspect=1.8)
g.map(sb.countplot, "day", color=base_color, order=day_order)
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Aggregated daily distribution of bike rentals categorized by trip type\n',
               fontsize=16, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Show the y ticks in thousands. A formatter works on the tick *values*, so it
# does not depend on the labels having been rendered already, and it cannot
# produce a label list whose length differs from the number of ticks.
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(
        FuncFormatter(lambda value, _pos: '{:0.0f} K'.format(value / 1000)))
    ax.tick_params(axis='both', labelsize=12)

g.set_xlabels('\nDay of the month', size=14)
g.set_ylabels('Rentals (Thousands)\n', size=14)
g.fig.subplots_adjust(wspace=0.1, hspace=0.3)

# bbox_inches='tight' grows the saved area so all x and y labels are included
g.fig.savefig('plots/3.2.35.c Facet grid of Trip rentals based on trip type and day of the month.png',
              dpi=300, bbox_inches='tight')
Let us plot more variety of graphs to observe the distribution of data and hidden insights.
# Three side-by-side views (violin, box, strip) of the day of the month per trip type.
plt.figure(figsize=[14, 4])
sb.set_style('darkgrid')

# set the palette as per requirement
sb.set_palette(palette='colorblind', n_colors=10, desat=1)
colorblind = sb.color_palette()
flatui = [colorblind[4], colorblind[6]]
sb.set_palette(flatui, desat=0.8)
base_color = sb.color_palette()[0]

# (plot function, plot-specific keywords, panel title, y label), left to right;
# only the left-most panel carries a y label.
panels = (
    (sb.violinplot, {'inner': 'quartile'}, 'Bike rentals - Violin plot\n', 'Day of the month\n'),
    (sb.boxplot, {}, 'Bike rentals - Box plot\n', ''),
    (sb.stripplot, {'alpha': 0.002}, 'Bike rentals - Strip plot\n', ''),
)
for position, (draw, extra_kws, panel_title, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    draw(data=bikeshare, x='trip_type', y='day', color=base_color, **extra_kws)
    plt.title(panel_title, weight='bold', fontsize=16)
    plt.xlabel('\nTrip type', fontsize=14)
    plt.ylabel(y_label, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

plt.subplots_adjust(wspace=0.3, hspace=0.3)

# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.35.d Distribution of Trip rentals based on trip type and day of the month.png', dpi=300, bbox_inches='tight')
Heat Map:
One alternative way of depicting the relationship between two categorical variables is through a Heat map. Heat maps were introduced earlier as the 2-d version of a histogram; here, we're using them as the 2-d version of a bar chart. The seaborn function heatmap is at home with this type of heat map implementation, but the input arguments are unlike most of the visualization functions. Instead of providing the original dataframe, we need to summarize the counts into a matrix that will then be plotted.
Now lets make a pivot from the categorical dataset created earlier into a more appropriate data structure.
# Reshape the grouped counts into a trip_type x day matrix for the heat map.
categorical_counts = categorical_counts.pivot(
    index='trip_type',
    columns='day',
    values='rentals',
)
categorical_counts
The data is ready to be plotted as a
heat map.
# Heat map of rentals by trip type and day of the month; every cell is
# annotated with its share of all trips.
plt.figure(figsize=[24, 2])
res = sb.heatmap(categorical_counts, annot=True, fmt='.0f', annot_kws={'size': 10},
                 linewidths=0.1, cmap="YlGnBu")
plt.title('Aggregated Daily distribution of rentals based on trip type\n', fontsize=16, weight='bold')
plt.xlabel('\nDay of the month', fontsize=14)
plt.ylabel('Trip type\n', fontsize=14)
plt.xticks(fontsize="12")
plt.yticks(rotation=0, fontsize="12", va="center")

# The annotations start out as raw counts; replace each with a percentage
# of the total number of trips.
total_trips = bikeshare.shape[0]
for cell_text in res.texts:
    share = np.round(int(cell_text.get_text()) / total_trips, 4)
    pct = np.round(share * 100, 1)
    cell_text.set_text(str(pct) + ' %')

# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.35.e Heat map of Trip rentals based on trip type and day of the month.png', dpi=300, bbox_inches='tight')
Find average rentals based on the day of the month:
The `trip_type` column needs to be converted into a categorical datatype; otherwise the `groupby` method will ignore `NaN` rental values and in turn produce an incorrect avg. rental value. Also, the `size()` method ignores the unused level combinations of the groups, hence use the `count()` method.
However, the above plots suggest that rentals decrease towards the end of the month, especially on the 31st. The rentals are grouped by day of the month, so the distribution is the cumulative sum for each day number over 3 years rather than for each individual month. There are only 21 occurrences of the 31st, while most other days occur 36 times over the 3-year period (2017-2019); days 29 and 30 occur 33 times because they are absent in February. This means the rate of rentals on the 31st is actually high compared to other days.
Hence, in order to compute the average daily rentals based on how often each day actually occurs in a month, apply the `size()` method instead of the `count()` method to take the missing dates into account.
# Number of rentals per calendar date and trip type.
group_keys = [bikeshare[column] for column in ("year", "month", "day", "trip_type")]
day_df = bikeshare.groupby(group_keys).size().reset_index(name='rentals')
day_df.head(10)
# How many rows day_df holds for each day of the month, listed in day order.
cat_order = day_df['day'].sort_values().unique()
print('Day - Occurances')
day_df['day'].value_counts()[cat_order]
# Point plot of the average rentals per day of the month, one line per trip type.
plt.figure(figsize=[8, 5])
sb.set_style('white')
# NOTE(review): the palette holds a single colour while the plot is split by
# trip_type - confirm the two lines are meant to share one colour.
flatui = ["#fff480"]
sb.set_palette(flatui, n_colors=1, desat=0.8)
base_color = sb.color_palette()[0]

sb.pointplot(data=day_df, x="day", y="rentals", hue='trip_type',
             linestyles=['-', '-'], ci=None)

plt.title('Average daily bike rentals categorized by trip type\n', weight='bold', fontsize=16)
plt.xlabel('\nDay of the month', fontsize=14)
plt.ylabel('Avg. bike rentals\n', fontsize=14)
plt.xticks(fontsize=12)

# Current tick positions are read, then replaced by a fixed 0..800 scale in steps of 100.
locs, labels = plt.yticks()
daily_rental_avg_max = 800
y_tick_values = np.arange(0, daily_rental_avg_max + 100, 100)
y_tick_names = [f'{v:0.0f}' for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

legend_style = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
    framealpha=1, borderpad=1, borderaxespad=1, loc='upper left', labelspacing=0.5,
    title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1, 1),
)
plt.legend(**legend_style)
sb.despine(top=True, right=True, left=False, bottom=False)

# Faint horizontal guide line at every y tick.
for level in y_tick_values:
    plt.axhline(level, ls='--', color='grey', linewidth=0.5, alpha=0.4)

# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.35.f Average daily bike rentals categorized by trip type.png', dpi=300, bbox_inches='tight')
bikeshare.day and bike_type columns:¶
Columns: day, bike_type
Data type: (Categorical, ordered) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
Stack plot:
# Stacked bar chart: total rentals per day of the month, split by bike type.

# Theme and palette (one colour per bike type)
sb.set_style('white')
flatui = ["darkslateblue", "lightseagreen", "royalblue", "rebeccapurple"]
sb.set_palette(flatui, desat = 0.8)
plt.figure(figsize = [10, 5])

# Sorted levels of both variables and the x position of every day
cat1_order = bikeshare.day.sort_values(ascending=True).unique()
cat2_order = bikeshare.bike_type.sort_values(ascending=True).unique()
x_positions = np.arange(len(cat1_order))

# Draw each bike type's per-day counts on top of the accumulated baseline
baselines = np.zeros(len(cat1_order))
for cat2 in cat2_order:
    inner_counts = bikeshare[bikeshare['bike_type'] == cat2]['day'].value_counts()
    # reindex() tolerates days without any rental of this bike type (plain
    # label indexing would raise KeyError); such days are stacked as 0.
    heights = inner_counts.reindex(cat1_order).fillna(0).values
    plt.bar(x = x_positions, height = heights, bottom = baselines, alpha = 0.8)
    baselines += heights

# y axis in thousands, reaching past the busiest day
max_count = bikeshare.groupby('day').size().max()
y_tick_values = np.arange(0, max_count + 10000, 10000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]

plt.title('Aggregated daily rentals based on bike type\n', weight = 'bold', fontsize = 16)
plt.xticks(x_positions, cat1_order, fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nDay of the month', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)

# Legend entries follow the order in which the bar groups were drawn
plt.legend(cat2_order, scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
           framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5,
           title='Bike type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.2, 1))
sb.despine()

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.36.a Stack plot of rentals based on bike type and day of the month.png', dpi=300, bbox_inches='tight')
Line plot:
The data needs some summarization by grouping together the respective variables. Then, reset the index and name the summarized data values accordingly.
# Summarise the number of rentals for every (bike type, day of the month) pair
categorical_counts = (
    bikeshare
    .groupby(['bike_type', 'day'])
    .size()
    .reset_index(name='rentals')
)
categorical_counts.head(10)
# Line plot: aggregated rentals per day of the month, one line per bike type.

# Theme and palette
sb.set_style('white')
flatui = ['#ff91e2', '#91ffda', '#60acfc', '#bd91ff']
sb.set_palette(flatui, n_colors=4, desat=0.6)
plt.figure(figsize = [6, 4])

# y axis in thousands, reaching past the largest count
max_count = categorical_counts.rentals.max()
y_tick_values = np.arange(0, max_count + 5000, 5000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]

ax = sb.lineplot(data = categorical_counts, x = "day", y = "rentals", hue="bike_type", linewidth=3)
plt.title('Aggregated daily rentals based on bike type\n', weight = 'bold', fontsize = 16)
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nDay of the month', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)

# Some seaborn versions emit the hue column name as the first legend entry,
# others use it as the legend title. Drop that entry when present and set the
# title explicitly, so that no real category label is ever overwritten.
handles, labels = ax.get_legend_handles_labels()
if labels and labels[0] == 'bike_type':
    handles, labels = handles[1:], labels[1:]
ax.legend(handles, labels, scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
          framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper left', labelspacing=0.5,
          title='Bike type', title_fontsize=12, fontsize=10, facecolor='white', markerfirst=True,
          handlelength=2, handletextpad=0.5, bbox_to_anchor=(1, 1))

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.36.b Line plot of rentals based on bike type and day of the month.png', dpi=300, bbox_inches='tight')
Individual plots of aggregated distribution of daily rentals over bike type:
# Facet grid: one count plot of rentals per day of the month for each bike type.

# Palette: take two colours from the 'colorblind' palette; the first one
# becomes the single bar colour
sb.set_palette(palette = 'colorblind', n_colors = 10, desat = 1)
flatui = [sb.color_palette()[4], sb.color_palette()[6]]
sb.set_palette(flatui, desat = 0.8)
base_color = sb.color_palette()[0]
sb.set_style('white')

# FacetGrid creates its own figure, so no separate plt.figure() call is made
# (it would only leave an empty figure behind).
day_order = bikeshare.day.sort_values(ascending=True).unique()
g = sb.FacetGrid(data = bikeshare, col = 'bike_type', col_wrap = 2, height = 4, aspect = 1.8)
g.map(sb.countplot, "day", color = base_color, order = day_order)
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Aggregated daily distribution of bike rentals categorized by bike type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')

# Relabel the y ticks in thousands. The facets share their y axis (FacetGrid
# default), so the tick positions of the first facet apply to all of them.
# Labels are built from the numeric tick positions: exactly one label per
# tick, independent of whether the label texts have been rendered yet.
y_ticks = g.axes.flat[0].get_yticks()
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_ticks]
g.set_yticklabels(y_tick_names, size = 12)
# Keep the existing x tick labels (the days), only adjusting their size
g.set_xticklabels(size = 12)
g.set_xlabels('\nDay of the month', size = 14)
g.set_ylabels('Rentals (thousands)\n', size = 14)
plt.subplots_adjust(wspace=0.1, hspace=0.3)

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.36.c Facet grid of rentals based on bike type and day of the month.png', dpi=300, bbox_inches='tight')
Let us plot more variety of graphs to observe the distribution of data and hidden insights.
# Three side-by-side views (violin, box, strip) of the day of the month per bike type
plt.figure(figsize=[16, 4])
sb.set_style('darkgrid')

# Palette: take two colours from the 'colorblind' palette; the first one is
# used for all three panels
sb.set_palette(palette='colorblind', n_colors=10, desat=1)
flatui = [sb.color_palette()[4], sb.color_palette()[6]]
sb.set_palette(flatui, desat=0.8)
base_color = sb.color_palette()[0]

# (plot function, extra keyword arguments, title, y label) from left to right
panels = [
    (sb.violinplot, {'inner': 'quartile'}, 'Bike rentals - Violin plot\n', 'Day of the month\n'),
    (sb.boxplot, {}, 'Bike rentals - Box plot\n', ''),
    (sb.stripplot, {'alpha': 0.002}, 'Bike rentals - Strip plot\n', ''),
]
for position, (draw, extra, heading, y_text) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    draw(data=bikeshare, x='bike_type', y='day', color=base_color, **extra)
    plt.title(heading, weight='bold', fontsize=16)
    plt.xlabel('\nBike type', fontsize=14)
    plt.ylabel(y_text, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

plt.subplots_adjust(wspace=0.2, hspace=0.3)

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.36.d Distribution of Trip rentals based on bike type and day of the month.png', dpi=300, bbox_inches='tight')
Heat Map:
Now let's pivot the categorical dataset created earlier into a more appropriate data structure.
# Reshape the long table into a wide one for the heat map: one row per bike
# type, one column per day, each cell holding the 'rentals' value
categorical_counts = categorical_counts.pivot(
    values='rentals',
    columns='day',
    index='bike_type',
)
categorical_counts
The data is ready to be plotted as the
Heat map.
# Heat map of rentals per bike type (rows) and day of the month (columns),
# annotated with each cell's share of all rentals.
plt.figure(figsize = [24, 4])
res = sb.heatmap(categorical_counts, annot = True, fmt = '.0f', annot_kws={'size':10}, linewidths=0.1, cmap="YlGnBu")
plt.yticks(rotation=0, fontsize="12", va="center")
plt.xticks(fontsize="12")
plt.title('Daily distribution of bike rentals based on bike type\n', fontsize = 16, weight = 'bold')
plt.xlabel('\nDay of the month', fontsize=14)
plt.ylabel('Bike type\n', fontsize=14)

# Convert the annotated counts into percentages of the total number of rentals
total_rentals = bikeshare.shape[0]
for t in res.texts:
    p = np.round(np.round(int(t.get_text())/total_rentals, 4)*100, 1)
    # Same convention as the pass-type and fare-type heat maps: a share that
    # rounds below 0.1 is shown as '< 0.1%' rather than a misleading '0.0 %'
    if p < 0.1:
        t.set_text('< 0.1%')
    else:
        t.set_text(str(p)+' %')

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.36.e Heat map of rentals based on bike type and day of the month.png', dpi=300, bbox_inches='tight')
Find average rentals based on the day of the month:
The
`bike_type` needs to be converted into a categorical datatype, otherwise the `groupby` method will ignore `NaN` rental values and in turn produce an incorrect avg. rental value. Also, use of the `size()` method ignores the unused level combinations of the groups, hence use the `count()` method.
However, the above plots suggest that rentals decrease towards the end of the month, especially on the 31st. The bike rentals are categorized by day of the month, and the distribution is calculated from the cumulative sum for each day over 3 years rather than for each individual month. Hence, there are only 21 occurrences of the 31st, while the other days occur 36 times over the 3-year period (2017-2019), except for days 29 and 30, which occur 33 times because they are absent in February. This means that the rate of rentals on the 31st is actually high compared to other days.
Hence, in order to compute the average daily rentals based on the occurrences of the day in any month, apply the
`size()` method instead of the `count()` method to take missing dates into account.
# Count the rentals of every calendar date separately for each bike type:
# size() returns one count per (year, month, day, bike_type) group.
group_keys = ['year', 'month', 'day', 'bike_type']
rentals_per_date = bikeshare.groupby(group_keys).size()
day_df = rentals_per_date.reset_index(name='rentals')
day_df.head(10)

# How many rows each day of the month contributes, listed in ascending day order
cat_order = day_df['day'].sort_values().unique()
print('Day - Occurances')
day_df['day'].value_counts()[cat_order]
# Canvas, theme and palette (one colour per bike type)
plt.figure(figsize=[8, 5])
sb.set_style('white')
flatui = ['#ff7ddd', '#77f7cc', '#4b99eb', '#aa75fa']
sb.set_palette(flatui, n_colors=4, desat=0.6)
base_color = sb.color_palette()[0]

# Point plot of the rentals per day of the month, one line per bike type,
# drawn without error bars (ci=None)
sb.pointplot(x="day", y="rentals", hue='bike_type', data=day_df,
             linestyles="-", scale=1, ci=None)

# Title and axis labels
plt.title('Average daily bike rentals categorized by bike type\n', weight='bold', fontsize=16)
plt.xlabel('\nDay of the month', fontsize=14)
plt.ylabel('Avg. bike rentals\n', fontsize=14)
plt.xticks(fontsize=12)

# Read the current y ticks, then impose a fixed scale from 0 to 800 in steps of 50
locs, labels = plt.yticks()
daily_rental_avg_max = 800
y_tick_values = np.arange(0, daily_rental_avg_max + 50, 50)
y_tick_names = list(map('{:0.0f}'.format, y_tick_values))
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

# Legend anchored just outside the upper-right corner of the axes
legend_options = dict(
    title='Bike type', title_fontsize=12, fontsize=10,
    loc='upper left', bbox_to_anchor=(1, 1), ncol=1,
    frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
    borderpad=1, borderaxespad=1, labelspacing=0.5,
    scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
)
plt.legend(**legend_options)
sb.despine(top=True, right=True, left=False, bottom=False)

# Faint guide line at every y tick, plus darker ones at 100, 300, 500 and 700
for tick in y_tick_values:
    plt.axhline(tick, ls='--', color='grey', linewidth=0.5, alpha=0.5)
for level in [100, 300, 500, 700]:
    plt.axhline(level, ls='--', color='black', linewidth=0.5, alpha=1)

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.36.f Average daily bike rentals categorized by bike type.png', dpi=300, bbox_inches='tight')
bikeshare.day and pass_type columns:¶
Columns: day, pass_type
Data type: (Categorical, ordered) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
Stack plot:
# Stacked bar chart: total rentals per day of the month, split by pass type.

# Theme, canvas and palette (one colour per pass type)
sb.set_style('white')
plt.figure(figsize = [10, 5])
flatui = ["#26bda7", "#9b59b6", "#3498db", "#e74c3c", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.6)

# Sorted levels of both variables and the x position of every day
cat1_order = bikeshare.day.sort_values(ascending=True).unique()
cat2_order = bikeshare.pass_type.sort_values(ascending=True).unique()
x_positions = np.arange(len(cat1_order))

# Draw each pass type's per-day counts on top of the accumulated baseline
baselines = np.zeros(len(cat1_order))
for cat2 in cat2_order:
    inner_counts = bikeshare[bikeshare['pass_type'] == cat2]['day'].value_counts()
    # reindex() tolerates days without any rental of this pass type (plain
    # label indexing would raise KeyError); such days are stacked as 0.
    heights = inner_counts.reindex(cat1_order).fillna(0).values
    plt.bar(x = x_positions, height = heights, bottom = baselines, alpha = 0.8)
    baselines += heights

# y axis in thousands, reaching past the busiest day
max_count = bikeshare.groupby('day').size().max()
y_tick_values = np.arange(0, max_count + 10000, 10000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]

plt.title('Aggregated daily rentals based on pass type\n', weight = 'bold', fontsize = 16)
plt.xticks(x_positions, cat1_order)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nDay of the month', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)

# Legend entries follow the order in which the bar groups were drawn
plt.legend(cat2_order, scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
           framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper left', labelspacing=0.5,
           title='Pass type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1, 1))
sb.despine()

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.37.a Stack plot of rentals based on pass type and day of the month.png', dpi=300, bbox_inches='tight')
Line plot:
The data needs some summarization by grouping together the respective variables. Then, reset the index and name the summarized data values accordingly.
# Summarise the number of rentals for every (pass type, day of the month) pair
categorical_counts = (
    bikeshare
    .groupby(['pass_type', 'day'])
    .size()
    .reset_index(name='rentals')
)
categorical_counts.head(10)
# Line plot: aggregated rentals per day of the month, one line per pass type.

# Theme and palette
sb.set_style('white')
flatui = ["#26bda7", "#9b59b6", "#3498db", "#e74c3c", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.6)
plt.figure(figsize = [6, 4])

# y axis in thousands, reaching past the largest count
max_count = categorical_counts.rentals.max()
y_tick_values = np.arange(0, max_count + 5000, 5000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]

ax = sb.lineplot(data=categorical_counts, x = "day", y = "rentals", hue="pass_type", linewidth=3, alpha = 0.8)
plt.title('Aggregated daily rentals based on pass type\n', weight = 'bold', fontsize = 16)
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nDay of the month', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)

# Some seaborn versions emit the hue column name as the first legend entry,
# others use it as the legend title. Drop that entry when present and set the
# title explicitly, so that no real category label is ever overwritten.
handles, labels = ax.get_legend_handles_labels()
if labels and labels[0] == 'pass_type':
    handles, labels = handles[1:], labels[1:]
ax.legend(handles, labels, scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
          framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5,
          title='Pass type', title_fontsize=12, fontsize=10, facecolor='white', markerfirst=True,
          handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.35, 1))

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.37.b Line plot of rentals based on pass type and day of the month.png', dpi=300, bbox_inches='tight')
Individual plots of aggregated distribution of daily rentals over pass type:
# Facet grid: one count plot of rentals per day of the month for each pass type.

# Palette: take two colours from the 'colorblind' palette; the first one
# becomes the single bar colour
sb.set_palette(palette = 'colorblind', n_colors = 10, desat = 1)
flatui = [sb.color_palette()[4], sb.color_palette()[6]]
sb.set_palette(flatui, desat = 0.8)
base_color = sb.color_palette()[0]
sb.set_style('white')

# FacetGrid creates its own figure, so no separate plt.figure() call is made
# (it would only leave an empty figure behind).
day_order = bikeshare.day.sort_values(ascending=True).unique()
g = sb.FacetGrid(data = bikeshare, col = 'pass_type', col_wrap = 2, height = 4, aspect = 1.8)
g.map(sb.countplot, "day", color = base_color, order = day_order)
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Aggregated daily distribution of bike rentals categorized by pass type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')

# Relabel the y ticks in thousands. The facets share their y axis (FacetGrid
# default), so the tick positions of the first facet apply to all of them.
# Labels are built from the numeric tick positions: exactly one label per
# tick, independent of whether the label texts have been rendered yet.
y_ticks = g.axes.flat[0].get_yticks()
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_ticks]
g.set_yticklabels(y_tick_names, size = 12)
# Keep the existing x tick labels (the days), only adjusting their size
g.set_xticklabels(size = 12)
g.set_xlabels('\nDay of the month', size = 14)
g.set_ylabels('Rentals (thousands)\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3)

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.37.c Facet grid of rentals based on pass type and day of the month.png', dpi=300, bbox_inches='tight')
Let us plot more variety of graphs to observe the distribution of data and hidden insights.
# Three side-by-side views (violin, box, strip) of the day of the month per pass type
plt.figure(figsize=[18, 4])
sb.set_style('darkgrid')

# Palette: take two colours from the 'colorblind' palette; the first one is
# used for all three panels
sb.set_palette(palette='colorblind', n_colors=10, desat=1)
flatui = [sb.color_palette()[4], sb.color_palette()[6]]
sb.set_palette(flatui, desat=0.8)
base_color = sb.color_palette()[0]

# (plot function, extra keyword arguments, title, y label) from left to right
panels = [
    (sb.violinplot, {'inner': 'quartile'}, 'Bike rentals - Violin plot\n', 'Day of the month\n'),
    (sb.boxplot, {}, 'Bike rentals - Box plot\n', ''),
    (sb.stripplot, {'alpha': 0.002}, 'Bike rentals - Strip plot\n', ''),
]
for position, (draw, extra, heading, y_text) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    draw(data=bikeshare, x='pass_type', y='day', color=base_color, **extra)
    plt.title(heading, weight='bold', fontsize=16)
    plt.xlabel('\nPass type', fontsize=14)
    plt.ylabel(y_text, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

plt.subplots_adjust(wspace=0.2, hspace=0.3)

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.37.d Distribution of Trip rentals based on pass type and day of the month.png', dpi=300, bbox_inches='tight')
Heat Map:
Now let's pivot the categorical dataset created earlier into a more appropriate data structure.
# Reshape the long table into a wide one for the heat map: one row per pass
# type, one column per day, each cell holding the 'rentals' value
categorical_counts = categorical_counts.pivot(
    values='rentals',
    columns='day',
    index='pass_type',
)
categorical_counts
The data is ready to be plotted as the
Heat map.
# Heat map of rentals per pass type (rows) and day of the month (columns)
plt.figure(figsize=[24, 4])
res = sb.heatmap(categorical_counts, cmap="YlGnBu", linewidths=0.1,
                 annot=True, fmt='.0f', annot_kws={'size': 10})
plt.title('Daily distribution of bike rentals based on pass type\n', fontsize=16, weight='bold')
plt.xlabel('\nDay of the month', fontsize=14)
plt.ylabel('Pass type\n', fontsize=14)
plt.xticks(fontsize="12")
plt.yticks(rotation=0, fontsize="12", va="center")

# Replace every annotated count by its share of all rentals; shares that
# round below 0.1 are shown as '< 0.1%'
for cell in res.texts:
    share = int(cell.get_text()) / bikeshare.shape[0]
    p = np.round(np.round(share, 4) * 100, 1)
    cell.set_text('< 0.1%' if p < 0.1 else str(p) + ' %')

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.37.e Heat map of rentals based on pass type and day of the month.png', dpi=300, bbox_inches='tight')
Find average rentals based on the day of the month:
The
`pass_type` needs to be converted into a categorical datatype, otherwise the `groupby` method will ignore `NaN` rental values and in turn produce an incorrect avg. rental value. Also, use of the `size()` method ignores the unused level combinations of the groups, hence use the `count()` method.
However, the above plots suggest that rentals decrease towards the end of the month, especially on the 31st. The bike rentals are categorized by day of the month, and the distribution is calculated from the cumulative sum for each day over 3 years rather than for each individual month. Hence, there are only 21 occurrences of the 31st, while the other days occur 36 times over the 3-year period (2017-2019), except for days 29 and 30, which occur 33 times because they are absent in February. This means that the rate of rentals on the 31st is actually high compared to other days.
Hence, in order to compute the average daily rentals based on the occurrences of the day in any month, apply the
`size()` method instead of the `count()` method to take missing dates into account.
# Count the rentals of every calendar date separately for each pass type:
# size() returns one count per (year, month, day, pass_type) group.
group_keys = ['year', 'month', 'day', 'pass_type']
rentals_per_date = bikeshare.groupby(group_keys).size()
day_df = rentals_per_date.reset_index(name='rentals')
day_df.head(10)

# How many rows each day of the month contributes, listed in ascending day order
cat_order = day_df['day'].sort_values().unique()
print('Day - Occurances')
day_df['day'].value_counts()[cat_order]
# Canvas, theme and palette (one colour per pass type)
plt.figure(figsize=[8, 5])
sb.set_style('white')
flatui = ["#34e0c7", "#c271e3", "#4cb1f5", "#e06458", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.6)
base_color = sb.color_palette()[0]

# Point plot of the rentals per day of the month, one line per pass type,
# drawn without error bars (ci=None)
sb.pointplot(x="day", y="rentals", hue='pass_type', data=day_df,
             linestyles="-", scale=1, ci=None)

# Title and axis labels
plt.title('Average daily bike rentals categorized by pass type\n', weight='bold', fontsize=16)
plt.xlabel('\nDay of the month', fontsize=14)
plt.ylabel('Avg. bike rentals\n', fontsize=14)
plt.xticks(fontsize=12)

# Read the current y ticks, then impose a fixed scale from 0 to 800 in steps of 100
locs, labels = plt.yticks()
daily_rental_avg_max = 800
y_tick_values = np.arange(0, daily_rental_avg_max + 100, 100)
y_tick_names = list(map('{:0.0f}'.format, y_tick_values))
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

# Legend anchored just outside the upper-right corner of the axes
legend_options = dict(
    title='Pass type', title_fontsize=12, fontsize=10,
    loc='upper left', bbox_to_anchor=(1, 1), ncol=1,
    frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
    borderpad=1, borderaxespad=1, labelspacing=0.5,
    scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
)
plt.legend(**legend_options)
sb.despine(top=True, right=True, left=False, bottom=False)

# Faint horizontal guide line at every y tick
for tick in y_tick_values:
    plt.axhline(tick, ls='--', color='grey', linewidth=0.5, alpha=0.3)

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.37.f Average daily bike rentals categorized by pass type.png', dpi=300, bbox_inches='tight')
bikeshare.day and fare_type columns:¶
Columns: day, fare_type
Data type: (Categorical, ordered) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
Stack plot:
# Stacked bar chart: total rentals per day of the month, split by fare type.

# Theme, canvas and palette (one colour per fare type)
sb.set_style('white')
plt.figure(figsize = [10, 5])
flatui = ['#466887', '#eda668']
sb.set_palette(flatui, n_colors=2, desat=0.6)

# Sorted levels of both variables and the x position of every day
cat1_order = bikeshare.day.sort_values(ascending=True).unique()
cat2_order = bikeshare.fare_type.sort_values(ascending=True).unique()
x_positions = np.arange(len(cat1_order))

# Draw each fare type's per-day counts on top of the accumulated baseline
baselines = np.zeros(len(cat1_order))
for cat2 in cat2_order:
    inner_counts = bikeshare[bikeshare['fare_type'] == cat2]['day'].value_counts()
    # reindex() tolerates days without any rental of this fare type (plain
    # label indexing would raise KeyError); such days are stacked as 0.
    heights = inner_counts.reindex(cat1_order).fillna(0).values
    plt.bar(x = x_positions, height = heights, bottom = baselines, alpha = 0.8)
    baselines += heights

# y axis in thousands, reaching past the busiest day
max_count = bikeshare.groupby('day').size().max()
y_tick_values = np.arange(0, max_count + 10000, 10000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]

plt.title('Aggregated daily rentals based on fare type\n', weight = 'bold', fontsize = 16)
plt.xticks(x_positions, cat1_order)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nDay of the month', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)

# Legend entries follow the order in which the bar groups were drawn
plt.legend(cat2_order, scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
           framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper left', labelspacing=0.5,
           title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1, 1))
sb.despine()

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.38.a Stack plot of rentals based on fare type and day of the month.png', dpi=300, bbox_inches='tight')
Line plot:
The data needs some summarization by grouping together the respective variables. Then, reset the index and name the summarized data values accordingly.
# Summarise the number of rentals for every (fare type, day of the month) pair
categorical_counts = (
    bikeshare
    .groupby(['fare_type', 'day'])
    .size()
    .reset_index(name='rentals')
)
categorical_counts.head(10)
# Line plot: aggregated rentals per day of the month, one line per fare type.

# Theme and palette
sb.set_style('white')
flatui = ['#466887', '#eda668']
sb.set_palette(flatui, n_colors=2, desat=0.6)
plt.figure(figsize = [6, 4])

# y axis in thousands, reaching past the largest count
max_count = categorical_counts.rentals.max()
y_tick_values = np.arange(0, max_count + 10000, 10000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]

ax = sb.lineplot(data=categorical_counts, x = "day", y = "rentals", hue="fare_type", linewidth=3, alpha = 0.8)
plt.title('Aggregated daily rentals based on fare type\n', weight = 'bold', fontsize = 16)
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nDay of the month', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)

# Some seaborn versions emit the hue column name as the first legend entry,
# others use it as the legend title. Drop that entry when present and set the
# title explicitly, so that no real category label is ever overwritten.
handles, labels = ax.get_legend_handles_labels()
if labels and labels[0] == 'fare_type':
    handles, labels = handles[1:], labels[1:]
ax.legend(handles, labels, scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
          framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5,
          title='Fare type', title_fontsize=12, fontsize=10, facecolor='white', markerfirst=True,
          handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.35, 1))
sb.despine()

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.38.b Line plot of rentals based on fare type and day of the month.png', dpi=300, bbox_inches='tight')
Individual plots of aggregated distribution of daily rentals over fare type:
# Facet grid: one count plot of rentals per day of the month for each fare type.

# Palette: take two colours from the 'colorblind' palette; the first one
# becomes the single bar colour
sb.set_palette(palette = 'colorblind', n_colors = 10, desat = 1)
flatui = [sb.color_palette()[4], sb.color_palette()[6]]
sb.set_palette(flatui, desat = 0.8)
base_color = sb.color_palette()[0]
sb.set_style('white')

# FacetGrid creates its own figure, so no separate plt.figure() call is made
# (it would only leave an empty figure behind).
day_order = bikeshare.day.sort_values(ascending=True).unique()
g = sb.FacetGrid(data = bikeshare, col = 'fare_type', col_wrap = 2, height = 4, aspect = 1.8)
g.map(sb.countplot, "day", color = base_color, order = day_order)
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Aggregated daily distribution of bike rentals categorized by fare type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')

# Relabel the y ticks in thousands. The facets share their y axis (FacetGrid
# default), so the tick positions of the first facet apply to all of them.
# Labels are built from the numeric tick positions: exactly one label per
# tick, independent of whether the label texts have been rendered yet.
y_ticks = g.axes.flat[0].get_yticks()
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_ticks]
g.set_yticklabels(y_tick_names, size = 12)
# Keep the existing x tick labels (the days), only adjusting their size
g.set_xticklabels(size = 12)
g.set_xlabels('\nDay of the month', size = 14)
g.set_ylabels('Rentals (thousands)\n', size = 14)
plt.subplots_adjust(wspace=0.1, hspace=0.3)

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.38.c Facet grid of rentals based on fare type and day of the month.png', dpi=300, bbox_inches='tight')
Let us plot more variety of graphs to observe the distribution of data and hidden insights.
# Three side-by-side views (violin, box, strip) of the day of the month per fare type
plt.figure(figsize = [14, 4])
sb.set_style('darkgrid')

# Palette: take two colours from the 'colorblind' palette; the first one is
# used for all three panels
sb.set_palette(palette = 'colorblind', n_colors = 10, desat = 1)
flatui = [sb.color_palette()[4], sb.color_palette()[6]]
sb.set_palette(flatui, desat = 0.8)
base_color = sb.color_palette()[0]

# (plot function, extra keyword arguments, title, y label) from left to right.
# The y axis shows the 'day' column, so the leftmost panel is labelled
# 'Day of the month' (it used to read 'Hour').
panels = [
    (sb.violinplot, {'inner': 'quartile'}, 'Bike rentals - Violin plot\n', 'Day of the month\n'),
    (sb.boxplot, {}, 'Bike rentals - Box plot\n', ''),
    (sb.stripplot, {'alpha': 0.002}, 'Bike rentals - Strip plot\n', ''),
]
for position, (draw, extra, heading, y_text) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    draw(data = bikeshare, x = 'fare_type', y = 'day', color = base_color, **extra)
    plt.title(heading, weight = 'bold', fontsize = 16)
    plt.xlabel('\nFare type', fontsize = 14)
    plt.ylabel(y_text, fontsize = 14)
    plt.xticks(fontsize = 12)
    plt.yticks(fontsize = 12)

plt.subplots_adjust(wspace=0.3, hspace=0.3)

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.38.d Distribution of Trip rentals based fare type and day of the month.png', dpi=300, bbox_inches='tight')
Heat Map:
Now let's pivot the categorical dataset created earlier into a more appropriate data structure.
# Reshape the long table into a wide one for the heat map: one row per fare
# type, one column per day, each cell holding the 'rentals' value
categorical_counts = categorical_counts.pivot(
    values='rentals',
    columns='day',
    index='fare_type',
)
categorical_counts
The data is ready to be plotted as the
Heat map.
# Heat map of rentals per fare type (rows) and day of the month (columns)
plt.figure(figsize=[24, 2])
res = sb.heatmap(categorical_counts, cmap="YlGnBu", linewidths=0.1,
                 annot=True, fmt='.0f', annot_kws={'size': 10})
plt.title('Daily distribution of bike rentals based on fare type\n', fontsize=16, weight='bold')
plt.xlabel('\nDay of the month', fontsize=14)
plt.ylabel('Fare type\n', fontsize=14)
plt.xticks(fontsize="12")
plt.yticks(rotation=0, fontsize="12", va="center")

# Replace every annotated count by its share of all rentals; shares that
# round below 0.1 are shown as '< 0.1%'
for cell in res.texts:
    share = int(cell.get_text()) / bikeshare.shape[0]
    p = np.round(np.round(share, 4) * 100, 1)
    cell.set_text('< 0.1%' if p < 0.1 else str(p) + ' %')

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.38.e Heat map of rentals based on fare type and day of the month.png', dpi=300, bbox_inches='tight')
Find average rentals based on the day of the month:
The
`fare_type` needs to be converted into a categorical datatype, otherwise the `groupby` method will ignore `NaN` rental values and in turn produce an incorrect avg. rental value. Also, use of the `size()` method ignores the unused level combinations of the groups, hence use the `count()` method.
However, the above plots suggest that rentals decrease towards the end of the month, especially on the 31st. The bike rentals are categorized by day of the month, and the distribution is calculated from the cumulative sum for each day over 3 years rather than for each individual month. Hence, there are only 21 occurrences of the 31st, while the other days occur 36 times over the 3-year period (2017-2019), except for days 29 and 30, which occur 33 times because they are absent in February. This means that the rate of rentals on the 31st is actually high compared to other days.
Hence, in order to compute the average daily rentals based on the occurrences of the day in any month, apply the
`size()` method instead of the `count()` method to take missing dates into account.
# Rentals per (year, month, day, fare_type); size() counts the rows in each group.
day_group_keys = ["year", "month", "day", "fare_type"]
day_df = bikeshare.groupby(day_group_keys).size().reset_index(name='rentals')
day_df.head(10)
# How many rows of day_df exist for each calendar day, listed in ascending day order.
cat_order = day_df['day'].sort_values(ascending=True).unique()
print('Day - Occurances')
day_occurrences = day_df['day'].value_counts()
day_occurrences[cat_order]
# Point plot of the mean rentals per calendar day, one line per fare type.
plt.figure(figsize=[8, 5])
sb.set_style('white')
flatui = ['#577da1', '#eda668']
sb.set_palette(flatui, n_colors=2, desat=0.6)
base_color = sb.color_palette()[0]
sb.pointplot(
    data=day_df,
    x="day",
    y="rentals",
    hue='fare_type',
    linestyles="-",
    scale=1,
    ci=None,  # no error bars; only the per-day means are drawn
)
plt.title('Average daily bike rentals categorized by fare type\n', weight='bold', fontsize=16)
plt.ylabel('Avg. bike rentals\n', fontsize=14)
plt.xlabel('\nDay of the month', fontsize=14)
plt.xticks(fontsize=12)

# Capture the automatic y ticks, then impose a fixed 0-800 scale in steps of 100.
locs, labels = plt.yticks()
daily_rental_avg_max = 800
y_tick_values = np.arange(0, daily_rental_avg_max + 100, 100)
y_tick_names = list(map('{:0.0f}'.format, y_tick_values))
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

# Legend anchored outside the axes, to the right of the plot.
legend_opts = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
    framealpha=1, borderpad=1, borderaxespad=1, loc='upper right', labelspacing=0.5,
    title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.25, 1),
)
plt.legend(**legend_opts)
sb.despine(top=True, right=True, left=False, bottom=False)

# Faint horizontal guide line at every y tick.
for tick in y_tick_values:
    plt.axhline(tick, ls='--', color='grey', linewidth=0.5, alpha=0.3)

# bbox_inches='tight' expands the saved canvas so all x and y labels are included
plt.savefig('plots/3.2.38.f Average daily bike rentals categorized by fare type.png', dpi=300, bbox_inches='tight')
bikeshare.weekday and trip_type columns:
Columns: weekday, trip_type
Data type: (Categorical, ordered) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
Stack plot:
# Stacked bar chart: total rentals per weekday, split by trip type.
sb.set_style('white')
flatui = ["#37326b", "#6eacdb"]
sb.set_palette(flatui, desat=0.8)
plt.figure(figsize=[10, 6])
cat1_order = bikeshare.weekday.sort_values(ascending=True).unique()
cat2_order = bikeshare.trip_type.sort_values(ascending=True).unique()

# One bar layer per trip type, each drawn on top of the layers before it.
bar_positions = np.arange(len(cat1_order))
baselines = np.zeros(len(cat1_order))
for cat2 in cat2_order:
    # weekday counts restricted to the current trip type
    inner_counts = bikeshare[bikeshare['trip_type'] == cat2]['weekday'].value_counts()
    # align to the weekday order once and reuse for both the bars and the baseline;
    # a plain array keeps the accumulation positional rather than index-aligned
    heights = inner_counts[cat1_order].fillna(0).to_numpy()
    plt.bar(x=bar_positions, height=heights, bottom=baselines, alpha=0.6)
    baselines = baselines + heights

# Tallest stacked bar = largest weekday total. Reduce the size() Series directly
# instead of calling max() on a frame that also contains the weekday column.
max_count = bikeshare.groupby('weekday').size().max()
y_tick_values = np.arange(0, max_count + 25000, 25000)
y_tick_names = ['{:0.0f} K'.format(v / 1000) for v in y_tick_values]
plt.title('Aggregated weekday rentals based on trip type\n', weight='bold', fontsize=16)
plt.xticks(bar_positions, cat1_order, fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nDay of the week', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.legend(cat2_order, scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
           framealpha=1, borderpad=1, borderaxespad=1, loc='upper right', labelspacing=0.5,
           title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.2, 1))
sb.despine()

# bbox_inches='tight' expands the saved canvas so all x and y labels are included
plt.savefig('plots/3.2.39.a Stack plot of Trip rentals based on trip type and day of the week.png', dpi=300, bbox_inches='tight')
Line Plot:
The data needs some summarization by grouping together the respective variables. Then, reset the index and name the summarized data values accordingly.
# Long-format table: number of rentals for every (trip_type, weekday) pair.
categorical_counts = (
    bikeshare
    .groupby(['trip_type', 'weekday'])
    .size()
    .reset_index(name='rentals')
)
categorical_counts.head(10)
# Line plot of total rentals per weekday, one line per trip type.
flatui = ["#37326b", "#6eacdb"]
sb.set_palette(flatui, desat=0.8)
sb.set_style('white')
plt.figure(figsize=[6, 4])

# y axis: a tick every 25 000 rentals, labelled in thousands
max_count = categorical_counts.rentals.max()
y_tick_values = np.arange(0, max_count + 25000, 25000)
y_tick_names = [f'{v / 1000:0.0f} K' for v in y_tick_values]

ax = sb.lineplot(
    data=categorical_counts,
    x="weekday",
    y="rentals",
    hue="trip_type",
    style="trip_type",
    err_style="bars",
    linewidth=3,
    markers=['o', 'o'],
)
plt.title('Aggregated weekday rentals based on trip type\n', weight='bold', fontsize=16)
plt.xticks(fontsize=10)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nDay of the week', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)

# Build the legend with a custom title, then blank its first text entry
# (the cell's original comment describes that entry as the hue title).
trip_legend = ax.legend(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1, framealpha=1,
    borderpad=1, borderaxespad=1, loc='upper right', labelspacing=0.5,
    title='Trip type', title_fontsize=12, fontsize=10, facecolor='white', markerfirst=True,
    handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.35, 1),
)
trip_legend.texts[0].set_text("")
sb.despine()

# bbox_inches='tight' expands the saved canvas so all x and y labels are included
plt.savefig('plots/3.2.39.b Line plot of Trip rentals based on trip type and day of the week.png', dpi=300, bbox_inches='tight')
Individual plots of aggregated distribution of weekday rentals over trip type:
# Facet grid: one weekday count plot per trip type.
from matplotlib.ticker import FuncFormatter

flatui = ["#37326b", "#6eacdb"]
sb.set_palette(flatui, desat=0.6)
base_color = sb.color_palette()[0]
sb.set_style('white')

# FacetGrid creates its own figure, so no separate plt.figure() call is made
# (it would only leave an empty figure behind).
g = sb.FacetGrid(data=bikeshare, col='trip_type', col_wrap=2, height=5, aspect=1.2)
g.map(sb.countplot, "weekday", color=base_color,
      order=bikeshare.weekday.sort_values(ascending=True).unique())
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Aggregated weekday distribution of bike rentals categorized by trip type\n', fontsize=14, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Show y ticks in thousands via a formatter working on the numeric tick value.
# Parsing rendered label text with int() fails on empty or unicode-minus labels,
# and concatenating labels from every facet yields more labels than ticks.
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(
        FuncFormatter(lambda value, _pos: '{:0.0f} K'.format(value / 1000)))
    ax.tick_params(axis='both', labelsize=10)

g.set_xlabels('\nDay of the week', size=12)
g.set_ylabels('Rentals (thousands)\n', size=12)
plt.subplots_adjust(wspace=0.1, hspace=0.3)

# bbox_inches='tight' expands the saved canvas so all x and y labels are included
plt.savefig('plots/3.2.39.c Facet grid of Trip rentals based on trip type and day of the week.png', dpi=300, bbox_inches='tight')
Let us plot more variety of graphs to observe the distribution of data and hidden insights.
Heat Map:
One alternative way of depicting the relationship between two categorical variables is through a Heat map. Heat maps were introduced earlier as the 2-d version of a histogram; here, we're using them as the 2-d version of a bar chart. The seaborn function heatmap is at home with this type of heat map implementation, but the input arguments are unlike most of the visualization functions. Instead of providing the original dataframe, we need to summarize the counts into a matrix that will then be plotted.
Now let's pivot the categorical dataset created earlier into a more appropriate data structure.
# Reshape the long-format counts into a trip_type x weekday matrix (cell value = rentals).
categorical_counts = categorical_counts.pivot(
    index='trip_type',
    columns='weekday',
    values='rentals',
)
categorical_counts
The data is ready to be plotted as the
Heat map.
# Annotated heat map of the trip_type x weekday rental matrix.
plt.figure(figsize=[10, 2])
res = sb.heatmap(
    categorical_counts,
    annot=True,
    fmt='.0f',
    annot_kws={'size': 12},
    linewidths=0.1,
    cmap="YlGnBu",
)
plt.yticks(rotation=0, fontsize="12", va="center")
plt.xticks(fontsize="12")
plt.title('Weekday distribution of rentals based on trip type\n', fontsize=16, weight='bold')
plt.xlabel('\nDay of the week', fontsize=14)
plt.ylabel('Trip type\n', fontsize=14)

# Replace every raw-count annotation with its share of all trips, in percent.
total_trips = bikeshare.shape[0]
for cell in res.texts:
    pct = np.round(np.round(int(cell.get_text()) / total_trips, 4) * 100, 1)
    cell.set_text(str(pct) + ' %')

# bbox_inches='tight' expands the saved canvas so all x and y labels are included
plt.savefig('plots/3.2.39.d Heat map of Trip rentals based on trip type and day of the week.png', dpi=300, bbox_inches='tight')
Find average rentals based on the weekday:
The
`trip_type` needs to be converted into a categorical datatype; otherwise the `groupby` method will ignore `NaN` rental values and in turn produce an incorrect `avg. rental` value. Also, the `size()` method ignores the unused level combinations of the groups, hence use the `count()` method.
Every fifth week in a month won't have all the occurrences of the
`weekday`, as months are not evenly divisible into 7-day spans (the number of days in a week). Hence, in order to accurately calculate the `average rentals` of the `weekday`, use the `size()` method, which takes only the unique combinations in the occurrences and ignores occurrences with `NULL values`.
# Rentals per (year, month, week, weekday, trip_type); size() counts the rows in each group.
weekday_group_keys = ["year", "month", "week", "weekday", "trip_type"]
weekday_df = bikeshare.groupby(weekday_group_keys).size().reset_index(name='rentals')
weekday_df.head(10)
# How many rows of weekday_df exist for each weekday, listed in sorted weekday order.
cat_order = weekday_df['weekday'].sort_values(ascending=True).unique()
print('Weekday - Occurances')
weekday_occurrences = weekday_df['weekday'].value_counts()
weekday_occurrences[cat_order]
# Point plot of mean rentals per weekday, one line per trip type,
# with each point annotated by its rounded mean value.
plt.figure(figsize=[8, 5])
sb.set_style('white')
flatui = ["#fff480"]
sb.set_palette(flatui, n_colors=1, desat=0.8)
base_color = sb.color_palette()[0]
sb.pointplot(data=weekday_df, x="weekday", y="rentals", linestyles=['-', '-'],
             hue='trip_type', ci=None)
plt.title('Average weekday bike rentals categorized by trip type\n', weight='bold', fontsize=16)
plt.ylabel('Avg. bike rentals\n', fontsize=14)
plt.xlabel('\nDay of the week', fontsize=14)
plt.xticks(fontsize=12)

# Fixed y scale: 0-800 in steps of 100.
weekday_rental_avg_max = 800
y_tick_values = np.arange(0, weekday_rental_avg_max + 100, 100)
y_tick_names = ['{:0.0f}'.format(v) for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

# add annotations
# -------------------------------------------------------
# Select 'rentals' before aggregating so mean() is applied to that column only;
# averaging the whole frame can fail on non-numeric grouping columns.
avg_rentals = weekday_df.groupby(["trip_type", "weekday"])["rentals"].mean().reset_index()
avg_rentals_max = avg_rentals.rentals.max()
avg_rentals_oneway = avg_rentals.query(' trip_type == "One Way" ')
avg_rentals_roundtrip = avg_rentals.query(' trip_type == "Round Trip" ')

# x tick positions; rows of each subset are paired with them positionally
# (assumes the grouped rows follow the x-axis weekday order - TODO confirm)
locs, labels = plt.xticks()
indent = 40  # vertical offset of the label above its point, in data units
for categorical_df in [avg_rentals_oneway, avg_rentals_roundtrip]:
    clrs = ['salmon' if trip == "One Way" else 'limegreen' for trip in categorical_df.trip_type]
    for loc, avg_rental_count, clr in zip(locs, categorical_df.rentals, clrs):
        pct_string = '{:0.0f}'.format(avg_rental_count)
        plt.text(loc, avg_rental_count + indent, pct_string, ha='center', color='black',
                 fontsize=12, bbox=dict(pad=1.9, alpha=0.2, color='none', fc=clr))
# -------------------------------------------------------

plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
           framealpha=1, borderpad=1, borderaxespad=1, loc='upper right', labelspacing=0.5,
           title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.25, 1))

# Faint horizontal guide line at every y tick.
for tick in y_tick_values:
    plt.axhline(tick, ls='--', color='grey', linewidth=0.5, alpha=0.3)
sb.despine(top=True, right=True, left=False, bottom=False)

# bbox_inches='tight' expands the saved canvas so all x and y labels are included
plt.savefig('plots/3.2.39.e Average weekday bike rentals categorized by trip type.png', dpi=300, bbox_inches='tight')
bikeshare.weekday and bike_type columns:
Columns: weekday, bike_type
Data type: (Categorical, ordered) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
Stack plot:
# Stacked bar chart: total rentals per weekday, split by bike type.
sb.set_style('white')
flatui = ["darkslateblue", "lightseagreen", "royalblue", "rebeccapurple"]
sb.set_palette(flatui, desat=0.8)
plt.figure(figsize=[8, 6])
cat1_order = bikeshare.weekday.sort_values(ascending=True).unique()
cat2_order = bikeshare.bike_type.sort_values(ascending=True).unique()

# One bar layer per bike type, each drawn on top of the layers before it.
bar_positions = np.arange(len(cat1_order))
baselines = np.zeros(len(cat1_order))
for cat2 in cat2_order:
    # weekday counts restricted to the current bike type
    inner_counts = bikeshare[bikeshare['bike_type'] == cat2]['weekday'].value_counts()
    # align to the weekday order once and reuse for both the bars and the baseline;
    # a plain array keeps the accumulation positional rather than index-aligned
    heights = inner_counts[cat1_order].fillna(0).to_numpy()
    plt.bar(x=bar_positions, height=heights, bottom=baselines, alpha=0.8)
    baselines = baselines + heights

# Tallest stacked bar = largest weekday total. Reduce the size() Series directly
# instead of calling max() on a frame that also contains the weekday column.
max_count = bikeshare.groupby('weekday').size().max()
y_tick_values = np.arange(0, max_count + 25000, 25000)
y_tick_names = ['{:0.0f} K'.format(v / 1000) for v in y_tick_values]
plt.title('Aggregated weekday rentals based on bike type\n', weight='bold', fontsize=16)
plt.xticks(bar_positions, cat1_order, fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nDay of the week', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.legend(cat2_order, scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
           framealpha=1, borderpad=1, borderaxespad=1, loc='upper right', labelspacing=0.5,
           title='Bike type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.2, 1))
sb.despine()

# bbox_inches='tight' expands the saved canvas so all x and y labels are included
plt.savefig('plots/3.2.40.a Stack plot of Trip rentals based on bike type and day of the week.png', dpi=300, bbox_inches='tight')
Line plot:
The data needs some summarization by grouping together the respective variables. Then, reset the index and name the summarized data values accordingly.
# Long-format table: number of rentals for every (bike_type, weekday) pair.
categorical_counts = (
    bikeshare
    .groupby(['bike_type', 'weekday'])
    .size()
    .reset_index(name='rentals')
)
categorical_counts.head(10)
# Line plot of total rentals per weekday, one line per bike type.
sb.set_style('white')
# alternative palette: ["darkslateblue", "lightseagreen", "royalblue", "rebeccapurple"]
flatui = ['#ff91e2', '#91ffda', '#60acfc', '#bd91ff']
sb.set_palette(flatui, n_colors=4, desat=0.6)
plt.figure(figsize=[7, 4])

# y axis: a tick every 25 000 rentals, labelled in thousands
max_count = categorical_counts.rentals.max()
y_tick_values = np.arange(0, max_count + 25000, 25000)
y_tick_names = [f'{v / 1000:0.0f} K' for v in y_tick_values]

ax = sb.lineplot(
    data=categorical_counts,
    x="weekday",
    y="rentals",
    hue="bike_type",
    style="bike_type",
    err_style="bars",
    linewidth=3,
    markers=['o', 'o', 'o', 'o'],
)
plt.title('Aggregated weekday rentals based on bike type\n', weight='bold', fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nDay of the week', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)

# Build the legend with a custom title, then blank its first text entry
# (the cell's original comment describes that entry as the hue title).
bike_legend = plt.legend(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1, framealpha=1,
    borderpad=1, borderaxespad=1, loc='upper right', labelspacing=0.5,
    title='Bike type', title_fontsize=12, fontsize=10, facecolor='white', markerfirst=True,
    handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.3, 1),
)
bike_legend.texts[0].set_text("")
sb.despine()

# bbox_inches='tight' expands the saved canvas so all x and y labels are included
plt.savefig('plots/3.2.40.b Line plot of Trip rentals based on bike type and day of the week.png', dpi=300, bbox_inches='tight')
Individual plots of aggregated distribution of weekday rentals over bike type:
# Facet grid: one weekday count plot per bike type.
from matplotlib.ticker import FuncFormatter

flatui = ["#37326b", "#6eacdb"]
sb.set_palette(flatui, desat=0.6)
base_color = sb.color_palette()[0]
sb.set_style('white')

# FacetGrid creates its own figure, so no separate plt.figure() call is made
# (it would only leave an empty figure behind).
g = sb.FacetGrid(data=bikeshare, col='bike_type', col_wrap=2, height=4, aspect=1.2)
g.map(sb.countplot, "weekday", color=base_color,
      order=bikeshare.weekday.sort_values(ascending=True).unique())
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Aggregated weekday distribution of bike rentals categorized by bike type\n', fontsize=16, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Show y ticks in thousands via a formatter working on the numeric tick value.
# Parsing rendered label text with int() fails on empty or unicode-minus labels,
# and concatenating labels from every facet yields more labels than ticks.
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(
        FuncFormatter(lambda value, _pos: '{:0.0f} K'.format(value / 1000)))
    ax.tick_params(axis='y', labelsize=10)
    ax.tick_params(axis='x', labelsize=10, labelrotation=30)

g.set_xlabels('\nDay of the week', size=12)
g.set_ylabels('Rentals (thousands)\n', size=12)
plt.subplots_adjust(wspace=0.1, hspace=0.3)

# bbox_inches='tight' expands the saved canvas so all x and y labels are included
plt.savefig('plots/3.2.40.c Facet grid of Trip rentals based on bike type and day of the week.png', dpi=300, bbox_inches='tight')
Let us plot more variety of graphs to observe the distribution of data and hidden insights.
Heat Map:
Now let's pivot the categorical dataset created earlier into a more appropriate data structure.
# Reshape the long-format counts into a bike_type x weekday matrix (cell value = rentals).
categorical_counts = categorical_counts.pivot(
    index='bike_type',
    columns='weekday',
    values='rentals',
)
categorical_counts
The data is ready to be plotted as the
Heat map.
# Annotated heat map of the bike_type x weekday rental matrix.
plt.figure(figsize=[10, 4])
res = sb.heatmap(
    categorical_counts,
    annot=True,
    fmt='.0f',
    annot_kws={'size': 12},
    linewidths=0.1,
    cmap="YlGnBu",
)
plt.yticks(rotation=0, fontsize="12", va="center")
plt.xticks(fontsize="12")
plt.title('Weekday distribution of bike rentals based on bike type\n', fontsize=16, weight='bold')
plt.xlabel('\nDay of the week', fontsize=14)
plt.ylabel('Bike type\n', fontsize=14)

# Replace every raw-count annotation with its share of all trips, in percent.
total_trips = bikeshare.shape[0]
for cell in res.texts:
    pct = np.round(np.round(int(cell.get_text()) / total_trips, 4) * 100, 1)
    cell.set_text(str(pct) + ' %')

# bbox_inches='tight' expands the saved canvas so all x and y labels are included
plt.savefig('plots/3.2.40.d Heat map of Trip rentals based on bike type and day of the week.png', dpi=300, bbox_inches='tight')
Find average rentals based on the weekday:
The
`bike_type` needs to be converted into a categorical datatype; otherwise the `groupby` method will ignore `NaN` rental values and in turn produce an incorrect `avg. rental` value. Also, the `size()` method ignores the unused level combinations of the groups, hence use the `count()` method.
Every fifth week in a month won't have all the occurrences of the
`weekday`, as months are not evenly divisible into 7-day spans (the number of days in a week). Hence, in order to accurately calculate the `average rentals` of the `weekday`, use the `size()` method, which takes only the unique combinations in the occurrences and ignores occurrences with `NULL values`.
# Rentals per (year, month, week, weekday, bike_type); size() counts the rows in each group.
weekday_group_keys = ["year", "month", "week", "weekday", "bike_type"]
weekday_df = bikeshare.groupby(weekday_group_keys).size().reset_index(name='rentals')
weekday_df.head(10)
# How many rows of weekday_df exist for each weekday, listed in sorted weekday order.
cat_order = weekday_df['weekday'].sort_values(ascending=True).unique()
print('Weekday - Occurances')
weekday_occurrences = weekday_df['weekday'].value_counts()
weekday_occurrences[cat_order]
def assign_clr(bike):
    """Return the colour used for a bike type's annotation box.

    "unknown", "Standard", "Electric" and "Smart" map to entries 0-3 of the
    current seaborn palette; any other value yields 'gold'.
    """
    for slot, known_bike in enumerate(("unknown", "Standard", "Electric", "Smart")):
        if bike == known_bike:
            return sb.color_palette()[slot]
    return 'gold'
# Point plot of mean rentals per weekday, one line per bike type,
# with each point annotated by its rounded mean value.
plt.figure(figsize=[8, 5])
sb.set_style('white')
# alternative palette: ['#ff91e2', '#91ffda', '#60acfc', '#bd91ff']
flatui = ['#ff7ddd', '#77f7cc', '#4b99eb', '#aa75fa']
sb.set_palette(flatui, n_colors=4, desat=0.6)
base_color = sb.color_palette()[0]
sb.pointplot(data=weekday_df, x="weekday", y="rentals", linestyles="-", hue='bike_type',
             scale=1, ci=None)
plt.title('Average weekday bike rentals categorized by bike type\n', weight='bold', fontsize=16)
plt.ylabel('Avg. bike rentals\n', fontsize=14)
plt.xlabel('\nDay of the week', fontsize=14)
plt.xticks(fontsize=12)

# Fixed y scale: 0-800 in steps of 100.
weekday_rental_avg_max = 800
y_tick_values = np.arange(0, weekday_rental_avg_max + 100, 100)
y_tick_names = ['{:0.0f}'.format(v) for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

# add annotations
# -------------------------------------------------------
# Select 'rentals' before aggregating so mean() is applied to that column only;
# averaging the whole frame can fail on non-numeric grouping columns.
avg_rentals = weekday_df.groupby(["bike_type", "weekday"])["rentals"].mean().reset_index()
avg_rentals_max = avg_rentals.rentals.max()
avg_rentals_unknown = avg_rentals.query(' bike_type == "unknown" ')
avg_rentals_standard = avg_rentals.query(' bike_type == "Standard" ')
avg_rentals_electric = avg_rentals.query(' bike_type == "Electric" ')
avg_rentals_smart = avg_rentals.query(' bike_type == "Smart" ')

# x tick positions; rows of each subset are paired with them positionally
# (assumes the grouped rows follow the x-axis weekday order - TODO confirm)
locs, labels = plt.xticks()
indent = 40  # vertical offset of the label above its point, in data units
for categorical_df in [avg_rentals_unknown, avg_rentals_standard, avg_rentals_electric, avg_rentals_smart]:
    clrs = [assign_clr(bike) for bike in categorical_df.bike_type]
    for loc, avg_rental_count, clr in zip(locs, categorical_df.rentals, clrs):
        pct_string = '{:0.0f}'.format(avg_rental_count)
        plt.text(loc, avg_rental_count + indent, pct_string, ha='center', color='black',
                 fontsize=12, bbox=dict(pad=1.9, alpha=0.2, color='none', fc=clr))
# -------------------------------------------------------

plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
           framealpha=1, borderpad=1, borderaxespad=1, loc='upper left', labelspacing=0.5,
           title='Bike type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.05, 1))
sb.despine(top=True, right=True, left=False, bottom=False)

# Faint horizontal guide line at every y tick.
for tick in y_tick_values:
    plt.axhline(tick, ls='--', color='grey', linewidth=0.5, alpha=0.5)

# bbox_inches='tight' expands the saved canvas so all x and y labels are included
plt.savefig('plots/3.2.40.e Average weekday bike rentals categorized by bike type.png', dpi=300, bbox_inches='tight')
def assign_clr(bike):
    """Return the colour used for a bike type's annotation box.

    "unknown", "Standard", "Electric" and "Smart" map to entries 0-3 of the
    current seaborn palette; any other value yields 'gold'.
    """
    for slot, known_bike in enumerate(("unknown", "Standard", "Electric", "Smart")):
        if bike == known_bike:
            return sb.color_palette()[slot]
    return 'gold'
# Point plot of mean rentals per weekday, one line per bike type (no value annotations).
plt.figure(figsize=[8, 5])
sb.set_style('white')
# alternative palette: ['#ff91e2', '#91ffda', '#60acfc', '#bd91ff']
flatui = ['#ff7ddd', '#77f7cc', '#4b99eb', '#aa75fa']
sb.set_palette(flatui, n_colors=4, desat=0.6)
base_color = sb.color_palette()[0]
sb.pointplot(
    data=weekday_df,
    x="weekday",
    y="rentals",
    hue='bike_type',
    linestyles="-",
    scale=1,
    ci=None,  # no error bars; only the per-weekday means are drawn
)
plt.title('Average weekday bike rentals categorized by bike type\n', weight='bold', fontsize=16)
plt.ylabel('Avg. bike rentals\n', fontsize=14)
plt.xlabel('\nDay of the week', fontsize=14)
plt.xticks(fontsize=12)

# Capture the automatic y ticks, then impose a fixed 0-800 scale in steps of 100.
locs, labels = plt.yticks()
weekday_rental_avg_max = 800
y_tick_values = np.arange(0, weekday_rental_avg_max + 100, 100)
y_tick_names = list(map('{:0.0f}'.format, y_tick_values))
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

# Legend anchored just outside the right edge of the axes.
legend_opts = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
    framealpha=1, borderpad=1, borderaxespad=1, loc='upper left', labelspacing=0.5,
    title='Bike type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1, 1),
)
plt.legend(**legend_opts)
sb.despine(top=True, right=True, left=False, bottom=False)

# Faint horizontal guide line at every y tick.
for tick in y_tick_values:
    plt.axhline(tick, ls='--', color='grey', linewidth=0.5, alpha=0.5)

# bbox_inches='tight' expands the saved canvas so all x and y labels are included
plt.savefig('plots/3.2.40.f Average weekday bike rentals categorized by bike type.png', dpi=300, bbox_inches='tight')
bikeshare.weekday and pass_type columns:
Columns: weekday, pass_type
Data type: (Categorical, ordered) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
Stack plot:
# Stacked bar chart: total rentals per weekday, split by pass type.
sb.set_style('white')
plt.figure(figsize=[8, 6])
flatui = ["#26bda7", "#9b59b6", "#3498db", "#e74c3c", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.6)
cat1_order = bikeshare.weekday.sort_values(ascending=True).unique()
cat2_order = bikeshare.pass_type.sort_values(ascending=True).unique()

# One bar layer per pass type, each drawn on top of the layers before it.
bar_positions = np.arange(len(cat1_order))
baselines = np.zeros(len(cat1_order))
for cat2 in cat2_order:
    # weekday counts restricted to the current pass type
    inner_counts = bikeshare[bikeshare['pass_type'] == cat2]['weekday'].value_counts()
    # align to the weekday order once and reuse for both the bars and the baseline;
    # a plain array keeps the accumulation positional rather than index-aligned
    heights = inner_counts[cat1_order].fillna(0).to_numpy()
    plt.bar(x=bar_positions, height=heights, bottom=baselines, alpha=0.8)
    baselines = baselines + heights

# Tallest stacked bar = largest weekday total. Reduce the size() Series directly
# instead of calling max() on a frame that also contains the weekday column.
max_count = bikeshare.groupby('weekday').size().max()
y_tick_values = np.arange(0, max_count + 25000, 25000)
y_tick_names = ['{:0.0f} K'.format(v / 1000) for v in y_tick_values]
plt.title('Aggregated weekday rentals based on pass type\n', weight='bold', fontsize=16)
plt.xticks(bar_positions, cat1_order, fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nDay of the week', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.legend(cat2_order, scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
           framealpha=1, borderpad=1, borderaxespad=1, loc='upper right', labelspacing=0.5,
           title='Pass type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.25, 1))
sb.despine()

# bbox_inches='tight' expands the saved canvas so all x and y labels are included
plt.savefig('plots/3.2.41.a Stack plot of Trip rentals based on pass type and day of the week.png', dpi=300, bbox_inches='tight')
Line plot:
The data needs some summarization by grouping together the respective variables. Then, reset the index and name the summarized data values accordingly.
# Long-format table: number of rentals for every (pass_type, weekday) pair.
categorical_counts = (
    bikeshare
    .groupby(['pass_type', 'weekday'])
    .size()
    .reset_index(name='rentals')
)
categorical_counts.head(10)
# Line plot of total rentals per weekday, one line per pass type.
sb.set_style('white')
flatui = ["#26bda7", "#9b59b6", "#3498db", "#e74c3c", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.6)
plt.figure(figsize=[7, 4])

# y axis: a tick every 25 000 rentals, labelled in thousands
max_count = categorical_counts.rentals.max()
y_tick_values = np.arange(0, max_count + 25000, 25000)
y_tick_names = [f'{v / 1000:0.0f} K' for v in y_tick_values]

ax = sb.lineplot(
    data=categorical_counts,
    x="weekday",
    y="rentals",
    hue="pass_type",
    alpha=0.8,
    err_style="bars",
    linewidth=3,
)
plt.title('Aggregated weekday rentals based on pass type\n', weight='bold', fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nDay of the week', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)

# Build the legend with a custom title, then blank its first text entry
# (the cell's original comment describes that entry as the hue title).
pass_legend = ax.legend(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1, framealpha=1,
    borderpad=1, borderaxespad=1, loc='upper right', labelspacing=0.5, title="Pass type",
    title_fontsize=12, fontsize=10, facecolor='white', markerfirst=True, handlelength=2,
    handletextpad=0.5, bbox_to_anchor=(1.3, 1),
)
pass_legend.texts[0].set_text("")
sb.despine()

# bbox_inches='tight' expands the saved canvas so all x and y labels are included
plt.savefig('plots/3.2.41.b Line plot of Trip rentals based on pass type and day of the week.png', dpi=300, bbox_inches='tight')
Individual plots of aggregated distribution of weekday rentals over pass type:
# Facet grid: one weekday count plot per pass type.
from matplotlib.ticker import FuncFormatter

flatui = ["#37326b", "#6eacdb"]
sb.set_palette(flatui, desat=0.6)
base_color = sb.color_palette()[0]
sb.set_style('white')

# FacetGrid creates its own figure, so no separate plt.figure() call is made
# (it would only leave an empty figure behind).
g = sb.FacetGrid(data=bikeshare, col='pass_type', col_wrap=3, height=4, aspect=1)
g.map(sb.countplot, "weekday", color=base_color,
      order=bikeshare.weekday.sort_values(ascending=True).unique())
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Aggregated weekday distribution of bike rentals categorized by pass type\n', fontsize=16, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Show y ticks in thousands via a formatter working on the numeric tick value.
# Parsing rendered label text with int() fails on empty or unicode-minus labels,
# and concatenating labels from every facet yields more labels than ticks.
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(
        FuncFormatter(lambda value, _pos: '{:0.0f} K'.format(value / 1000)))
    ax.tick_params(axis='y', labelsize=10)
    ax.tick_params(axis='x', labelsize=10, labelrotation=30)

g.set_xlabels('\nDay of the week', size=12)
g.set_ylabels('Rentals (thousands)\n', size=12)
plt.subplots_adjust(wspace=0.1, hspace=0.3)

# bbox_inches='tight' expands the saved canvas so all x and y labels are included
plt.savefig('plots/3.2.41.c Facet grid of Trip rentals based on pass type and day of the week.png', dpi=300, bbox_inches='tight')
Let us plot more variety of graphs to observe the distribution of data and hidden insights.
Heat Map:
Now let's pivot the categorical dataset created earlier into a more appropriate data structure.
# Reshape the long-format counts into a pass_type x weekday matrix (cell value = rentals).
categorical_counts = categorical_counts.pivot(
    index='pass_type',
    columns='weekday',
    values='rentals',
)
categorical_counts
The data is ready to be plotted as the
Heat map.
# Annotated heat map of the pass_type x weekday rental matrix.
plt.figure(figsize=[10, 4])
res = sb.heatmap(
    categorical_counts,
    annot=True,
    fmt='.0f',
    annot_kws={'size': 12},
    linewidths=0.1,
    cmap="YlGnBu",
)
plt.yticks(rotation=0, fontsize="12", va="center")
plt.xticks(fontsize="12")
plt.title('Weekday distribution of bike rentals based on pass type\n', fontsize=16, weight='bold')
plt.xlabel('\nDay of the week', fontsize=14)
plt.ylabel('Pass type\n', fontsize=14)

# Replace every raw-count annotation with its share of all trips, in percent.
total_trips = bikeshare.shape[0]
for cell in res.texts:
    pct = np.round(np.round(int(cell.get_text()) / total_trips, 4) * 100, 1)
    # shares that round below 0.1 % get a floor label instead of "0.0 %"
    cell.set_text('< 0.1%' if pct < 0.1 else str(pct) + ' %')

# bbox_inches='tight' expands the saved canvas so all x and y labels are included
plt.savefig('plots/3.2.41.d Heat map of Trip rentals based on pass type and day of the week.png', dpi=300, bbox_inches='tight')
Find average rentals based on the weekday:
The
`pass_type` needs to be converted into a categorical datatype; otherwise the `groupby` method will ignore `NaN` rental values and in turn produce an incorrect `avg. rental` value. Also, the `size()` method ignores the unused level combinations of the groups, hence use the `count()` method.
Every fifth week in a month won't have all the occurrences of the
`weekday`, as months are not evenly divisible into 7-day spans (the number of days in a week). Hence, in order to accurately calculate the `average rentals` of the `weekday`, use the `size()` method, which takes only the unique combinations in the occurrences and ignores occurrences with `NULL values`.
# Rentals per (year, month, week, weekday, pass_type); size() counts the rows in each group.
weekday_group_keys = ["year", "month", "week", "weekday", "pass_type"]
weekday_df = bikeshare.groupby(weekday_group_keys).size().reset_index(name='rentals')
weekday_df.head(10)
# How many rows of weekday_df exist for each weekday, listed in sorted weekday order.
cat_order = weekday_df['weekday'].sort_values(ascending=True).unique()
print('Weekday - Occurances')
weekday_occurrences = weekday_df['weekday'].value_counts()
weekday_occurrences[cat_order]
def assign_clr(pass_type):
    """Return the plot colour associated with a pass type.

    The five known pass types map, in the order listed below, to the first
    five entries of the currently active seaborn palette. Any other value
    falls back to the named colour 'gold'.
    """
    palette_order = ("Walk-up", "One Day", "Monthly", "Flex", "Annual")
    if pass_type in palette_order:
        # The palette is looked up on every call, so it reflects whatever
        # palette is active at call time.
        return sb.color_palette()[palette_order.index(pass_type)]
    return 'gold'
# Point plot of the mean rentals per weekday, one line per pass type.
plt.figure(figsize=[8, 5])
sb.set_style('white')
flatui = ["#34e0c7", "#c271e3", "#4cb1f5", "#e06458", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.6)
base_color = sb.color_palette()[0]

# ci=None: plot the point estimates only, without confidence intervals
sb.pointplot(x="weekday", y="rentals", hue='pass_type', data=weekday_df,
             linestyles="-", scale=1, ci=None)
plt.title('Average weekday bike rentals categorized by pass type\n', weight='bold', fontsize=16)
plt.xlabel('\nDay of the week', fontsize=14)
plt.ylabel('Avg. bike rentals\n', fontsize=14)
plt.xticks(fontsize=12)

# y axis: one tick every 100 rentals, from 0 up to a hard-coded maximum
locs, labels = plt.yticks()
weekday_rental_avg_max = 800
y_tick_values = np.arange(0, weekday_rental_avg_max+100, 100)
y_tick_names = list(map('{:0.0f}'.format, y_tick_values))
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

# Legend is anchored outside the axes, to the right of the plot
legend_options = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
    framealpha=1, borderpad=1, borderaxespad=1, loc='upper left', labelspacing=0.5,
    title='Pass type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1, 1),
)
plt.legend(**legend_options)
sb.despine(top=True, right=True, left=False, bottom=False);

# Faint dashed guide line at every y tick
for loc in y_tick_values:
    plt.axhline(loc, ls='--', color='grey', linewidth=0.5, alpha=0.5)

# bbox_inches='tight' adjusts the figure to include all of the x and y labels
plt.savefig('plots/3.2.41.e Average weekday bike rentals categorized by pass type.png', dpi=300, bbox_inches='tight')
bikeshare.weekday and fare_type columns:¶
Columns: weekday, fare_type
Data type: (Categorical, ordered) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
Stack plot:
# Stacked bar chart: weekdays on the x axis, one bar segment per fare type.
sb.set_style('white')
plt.figure(figsize=[8, 6])
flatui = ['#466887', '#eda668']
sb.set_palette(flatui, n_colors=2, desat=0.6)
cat1_order = bikeshare.weekday.sort_values(ascending=True).unique()
cat2_order = bikeshare.fare_type.sort_values(ascending=True).unique()

# Draw one layer of bars per fare type, each on top of the running total
bar_positions = np.arange(len(cat1_order))
baselines = np.zeros(len(cat1_order))
for cat2 in cat2_order:
    # rentals of this fare type per weekday; weekdays without rentals count as 0
    inner_counts = bikeshare[bikeshare['fare_type'] == cat2]['weekday'].value_counts()
    segment_heights = inner_counts[cat1_order].fillna(0)
    plt.bar(x=bar_positions, height=segment_heights, bottom=baselines, alpha=0.8)
    baselines += segment_heights

# y axis: one tick every 25,000 rentals, labelled in thousands
weekday_totals = bikeshare.groupby([bikeshare['weekday']]).size().reset_index(name='rentals')
max_count = weekday_totals.max()['rentals']
y_tick_values = np.arange(0, max_count + 25000, 25000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]

plt.title('Aggregated weekday rentals based on fare type\n', weight='bold', fontsize=16)
plt.xlabel('\nDay of the week', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.xticks(bar_positions, cat1_order, fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

legend_options = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
    framealpha=1, borderpad=1, borderaxespad=1, loc='upper right', labelspacing=0.5,
    title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.2, 1),
)
plt.legend(cat2_order, **legend_options)
sb.despine();

# bbox_inches='tight' adjusts the figure to include all of the x and y labels
plt.savefig('plots/3.2.42.a Stack plot of rentals based on fare type and day of the week.png', dpi=300, bbox_inches='tight')
Line plot:
The data needs some summarization by grouping together the respective variables. Then, reset the index and name the summarized data values accordingly.
# Count the rentals in every (fare_type, weekday) group; size() yields the
# number of rows per group and reset_index names that column 'rentals'.
categorical_counts = bikeshare.groupby([bikeshare['fare_type'],
bikeshare['weekday']]).size().reset_index(name='rentals')
# Preview the first ten rows
categorical_counts.head(10)
# Line plot of the aggregated rentals per weekday, one line per fare type.
# set the palette as per requirement
sb.set_style('white')
flatui = ['#466887', '#eda668']
sb.set_palette(flatui, n_colors=2, desat=0.6)
plt.figure(figsize = [6, 4])
# y axis: one tick every 25,000 rentals, labelled in thousands
max_count = categorical_counts.rentals.max()
y_tick_values = np.arange(0, max_count + 25000, 25000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
# plot line plot
ax = sb.lineplot(data=categorical_counts, x = "weekday", y = "rentals", hue="fare_type",
                 alpha = 0.8, err_style="bars", linewidth=3)
# The x variable is the weekday, so the title and x label name the weekday
# (matching the stacked bar chart and the saved file name).
plt.title('Aggregated weekday rentals based on fare type\n', weight = 'bold', fontsize = 16)
plt.xticks(rotation = 30, fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nDay of the week', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
# when 'hue' is used, its label appears as legend title
# blank that first legend entry; the custom title is passed via `title`.
ax.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1, framealpha = 1,
          borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5,
          title='Fare type', title_fontsize=12, fontsize=10, facecolor='white', markerfirst=True,
          handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.35, 1)).texts[0].set_text("");
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.42.b Line plot of rentals based on fare type and day of the week.png', dpi=300, bbox_inches='tight')
Individual plots of aggregated distribution of weekday rentals over fare type:
# Facet grid: one weekday count plot per fare type.
# set the palette as per requirement
flatui = ["#37326b", "#6eacdb"]
sb.set_palette(flatui, desat = 0.6)
base_color = sb.color_palette()[0]
sb.set_style('white')
plt.figure(figsize = [8, 4])
# plot facet grid
g = sb.FacetGrid(data = bikeshare, col = 'fare_type', col_wrap = 2, height = 5, aspect = 1.2)
g.map(sb.countplot, "weekday", color = base_color, order = bikeshare.weekday.sort_values(ascending=True).unique())
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Aggregated weekday distribution of bike rentals categorized by fare type\n', fontsize = 14, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
# Relabel the ticks facet by facet. Collecting the labels of ALL facets into
# one list and applying that list to every facet hands each axis more labels
# than it has ticks, so each facet gets exactly its own labels instead.
y_tick_names = []
x_tick_names = []
for ax in g.axes.flat:
    # y ticks: express the tick locations in thousands ('25 K')
    y_tick_names = ['{:0.0f} K'.format(tick/1000) for tick in ax.get_yticks()]
    # x ticks: keep the weekday text, only the font size changes
    x_tick_names = [x_label.get_text() for x_label in ax.get_xticklabels()]
    ax.set_yticklabels(y_tick_names, size = 10)
    ax.set_xticklabels(x_tick_names, size = 10)
g.set_xlabels('\nDay of the week', size = 12)
g.set_ylabels('Rentals (thousands)\n', size = 12)
plt.subplots_adjust(wspace=0.1, hspace=0.3);
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.42.c Facet grid of rentals based on fare type and day of the week.png', dpi=300, bbox_inches='tight')
Let us plot a wider variety of graphs to observe the distribution of the data and uncover hidden insights.
Heat Map:
Now let's pivot the categorical dataset created earlier into a more appropriate data structure.
# Reshape the long-format counts into a matrix with one row per fare_type,
# one column per weekday and the rental counts as cell values.
# NOTE: this rebinds categorical_counts to the pivoted frame, so the cell
# cannot be re-run without first rebuilding the long-format frame.
categorical_counts = categorical_counts.pivot(index = 'fare_type', columns = 'weekday', values = 'rentals')
# Display the pivoted matrix
categorical_counts
The data is ready to be plotted as a
heat map.
# Heat map of rentals with one row per fare type and one column per weekday
plt.figure(figsize=[10, 2])
res = sb.heatmap(categorical_counts, cmap="YlGnBu", linewidths=0.1,
                 annot=True, fmt='.0f', annot_kws={'size': 12})
plt.title('Weekday distribution of bike rentals based on fare type\n', fontsize=16, weight='bold')
plt.xlabel('\nDay of the week', fontsize=14)
plt.ylabel('Fare type\n', fontsize=14)
plt.xticks(fontsize="12")
plt.yticks(rotation=0, fontsize="12", va="center")

# Rewrite every cell annotation (a raw count) as its share of all rentals;
# shares that round below 0.1 % are shown as '< 0.1%'.
total_rentals = bikeshare.shape[0]
for annotation in res.texts:
    share = np.round(np.round(int(annotation.get_text())/total_rentals, 4)*100, 1)
    annotation.set_text('< 0.1%' if share < 0.1 else str(share)+' %')

# bbox_inches='tight' adjusts the figure to include all of the x and y labels
plt.savefig('plots/3.2.42.d Heat map of rentals based on fare type and day of the week.png', dpi=300, bbox_inches='tight')
Find average rentals based on the weekday:
The
`fare_type` column needs to be converted into a categorical datatype; otherwise the `groupby` method will ignore `NaN` rental values and in turn produce an incorrect average rental value. Also, the `size()` method ignores the unused level combinations of the groups, hence use the `count()` method.
Every fifth week in a month won't have all the occurrences of the
`weekday`, because months do not divide evenly into 7-day spans (the number of days in a week). Hence, in order to accurately calculate the average rentals of the `weekday`, use the `size()` method, which takes only the unique combinations in the occurrences and ignores occurrences with `NULL` values.
# Count the rentals in every (year, month, week, weekday, fare_type) group.
# size() yields the number of rows per group; reset_index turns the group
# keys back into columns and names the count column 'rentals'.
weekday_df = bikeshare.groupby([bikeshare["year"],
bikeshare["month"],
bikeshare["week"],
bikeshare["weekday"],
bikeshare["fare_type"]]).size().reset_index(name='rentals')
# Preview the first ten rows
weekday_df.head(10)
# Distinct weekday values in ascending sort order
cat_order = weekday_df.weekday.sort_values(ascending=True).unique()
print('Weekday - Occurances')
# Number of weekday_df rows per weekday, listed in cat_order
weekday_df.weekday.value_counts()[cat_order]
# Point plot of the mean rentals per weekday, one line per fare type, with
# the mean value printed above every point.
plt.figure(figsize=[8,5])
sb.set_style('white')
flatui = ['#577da1', '#eda668']
sb.set_palette(flatui, n_colors=2, desat=0.6)
base_color = sb.color_palette()[0]
sb.pointplot(data = weekday_df, x = "weekday", y = "rentals", linestyles = "-", hue = 'fare_type',
             scale = 1, ci = None)
plt.title('Average weekday bike rentals categorized by fare type\n', weight = 'bold', fontsize = 16)
plt.ylabel('Avg. bike rentals\n', fontsize = 14)
plt.xlabel('\nDay of the week', fontsize = 14)
plt.xticks(fontsize = 12)
# y axis: one tick every 100 rentals, from 0 up to a hard-coded maximum
weekday_rental_avg_max = 800
y_tick_values = np.arange(0, weekday_rental_avg_max+100, 100)
y_tick_names = ['{:0.0f}'.format(v) for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize = 12)
# add annotations
# -------------------------------------------------------
# Select 'rentals' BEFORE aggregating so that mean() is applied to that
# column only and never to the remaining (possibly non-numeric) columns.
avg_rentals = weekday_df.groupby([weekday_df["fare_type"], weekday_df["weekday"]])['rentals'].mean().reset_index()
avg_rentals_max = avg_rentals.rentals.max()
avg_rentals_base = avg_rentals.query(' fare_type == "Base" ')
avg_rentals_extended = avg_rentals.query(' fare_type == "Extended" ')
# get the current x tick locations (one per weekday)
locs, labels = plt.xticks()
# vertical offset between a point and its annotation, in data units
indent = 40
for categorical_df in [avg_rentals_base, avg_rentals_extended]:
    # annotation box colour follows the line colour of the fare type
    clrs = [sb.color_palette()[0] if fare == "Base" else sb.color_palette()[1] for fare in categorical_df.fare_type]
    # pair every tick location with the average of the matching weekday
    for loc, avg_rental_count, clr in zip(locs, categorical_df.rentals, clrs):
        pct_string = '{:0.0f}'.format(avg_rental_count)
        # print the rounded average just above its point
        plt.text(loc, avg_rental_count + indent, pct_string, ha = 'center', color = 'black',
                 fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
           framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper left', labelspacing=0.5,
           title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.25, 1))
sb.despine(top=True, right=True, left=False, bottom=False);
# faint dashed guide line at every y tick
for loc in y_tick_values:
    plt.axhline(loc, ls='--', color='grey', linewidth=0.5, alpha=0.5)
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.42.e Average weekday bike rentals categorized by fare type.png', dpi=300, bbox_inches='tight')
Average weekday bike rentals categorized by fare type, portraying the increase or decrease in average rentals compared to the previous day.
def assign_clrs(counts):
    """Return one trend colour per value in ``counts``.

    A value strictly greater than its predecessor is coloured
    'mediumseagreen'; any other value is coloured 'salmon'. The first value
    has no predecessor and is always 'mediumseagreen'.

    The values are compared by position. Indexing ``counts[i-1]`` directly
    would, for a list and ``i == 0``, silently wrap around to the LAST
    element instead of signalling "no previous value", so the sequence is
    walked as (previous, current) pairs instead.

    Parameters
    ----------
    counts : iterable of numbers
        Values in plotting order.

    Returns
    -------
    list of str
        Colour names, one per input value (empty for empty input).
    """
    values = list(counts)
    clr_list = ['mediumseagreen'] if values else []
    for previous, current in zip(values, values[1:]):
        clr_list.append('mediumseagreen' if current > previous else 'salmon')
    return clr_list
# Point plot of the mean rentals per weekday, one line per fare type; every
# point is annotated with its mean, on a box coloured by assign_clrs().
plt.figure(figsize=[8,5])
sb.set_style('white')
flatui = ['#577da1', '#eda668']
sb.set_palette(flatui, n_colors=2, desat=0.6)
base_color = sb.color_palette()[0]
sb.pointplot(data = weekday_df, x = "weekday", y = "rentals", linestyles = "-", hue = 'fare_type',
             scale = 1, ci = None)
plt.title('Average weekday bike rentals categorized by fare type\n', weight = 'bold', fontsize = 16)
plt.ylabel('Avg. bike rentals\n', fontsize = 14)
plt.xlabel('\nDay of the week', fontsize = 14)
plt.xticks(fontsize = 12)
# y axis: one tick every 100 rentals, from 0 up to a hard-coded maximum
weekday_rental_avg_max = 800
y_tick_values = np.arange(0, weekday_rental_avg_max+100, 100)
y_tick_names = ['{:0.0f}'.format(v) for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize = 12)
# add annotations
# -------------------------------------------------------
# Select 'rentals' BEFORE aggregating so that mean() is applied to that
# column only and never to the remaining (possibly non-numeric) columns.
avg_rentals = weekday_df.groupby([weekday_df["fare_type"], weekday_df["weekday"]])['rentals'].mean().reset_index()
avg_rentals_base = avg_rentals.query(' fare_type == "Base" ')
avg_rentals_extended = avg_rentals.query(' fare_type == "Extended" ')
# get the current x tick locations (one per weekday)
locs, labels = plt.xticks()
# vertical offset between a point and its annotation, in data units
indent = 40
for categorical_df in [avg_rentals_base, avg_rentals_extended]:
    # a plain list gives assign_clrs positional access to the averages
    avg_rental_counts = list(categorical_df.rentals)
    clrs = assign_clrs(avg_rental_counts)
    # pair every tick location with the average of the matching weekday
    for loc, avg_rental_count, clr in zip(locs, avg_rental_counts, clrs):
        pct_string = '{:0.0f}'.format(avg_rental_count)
        # print the rounded average just above its point
        plt.text(loc, avg_rental_count + indent, pct_string, ha = 'center', color = 'black',
                 fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
           framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5,
           title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.25, 1))
sb.despine(top=True, right=True, left=False, bottom=False);
# faint dashed guide line at every y tick
for loc in y_tick_values:
    plt.axhline(loc, ls='--', color='grey', linewidth=0.5, alpha=0.5)
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.42.f Average weekday bike rentals categorized by fare type.png', dpi=300, bbox_inches='tight')
bikeshare.month and trip_type columns:¶
Columns: month, trip_type
Data type: (Categorical, ordered) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
Stack plot:
# Stacked bar chart: months on the x axis, one bar segment per trip type.
sb.set_style('white')
flatui = ["slateblue", "mediumorchid"]
sb.set_palette(flatui, desat = 0.8)
base_color = sb.color_palette()[0]
plt.figure(figsize = [8, 5])
cat1_order = bikeshare.month.sort_values(ascending=True).unique()
cat2_order = bikeshare.trip_type.sort_values(ascending=True).unique()
# plot stacked plot
baselines = np.zeros(len(cat1_order))
# for each second-variable category:
for cat2 in cat2_order:
    # isolate the monthly counts of this trip type (months without rentals count as 0),
    inner_counts = bikeshare[bikeshare['trip_type'] == cat2]['month'].value_counts()
    segment_heights = inner_counts[cat1_order].fillna(0)
    # then plot those counts on top of the accumulated baseline
    plt.bar(x = np.arange(len(cat1_order)), height = segment_heights,
            bottom = baselines, alpha =0.8)
    baselines += segment_heights
# improve plot aesthetics
# y axis: one tick every 10,000 rentals, labelled in thousands
max_count = bikeshare.groupby([bikeshare['month']]).size().reset_index(name='rentals').max()['rentals']
y_tick_values = np.arange(0, max_count + 10000, 10000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
plt.title('Aggregated monthly rentals based on trip type\n', weight = 'bold', fontsize = 16)
plt.xticks(np.arange(len(cat1_order)), cat1_order, fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nMonth of the year', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.legend(cat2_order, scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
           framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5,
           title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.25, 1))
sb.despine();
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.43.a Stack plot of Trip rentals based on trip type and month of the year.png', dpi=300, bbox_inches='tight')
Line Plot:
The data needs some summarization by grouping together the respective variables. Then, reset the index and name the summarized data values accordingly.
# Count the rentals in every (trip_type, month) group; size() yields the
# number of rows per group and reset_index names that column 'rentals'.
categorical_counts = bikeshare.groupby([bikeshare['trip_type'],
bikeshare['month']]).size().reset_index(name='rentals')
# Preview the first ten rows
categorical_counts.head(10)
# Line plot of the aggregated rentals per month, one line per trip type.
# set the palette as per requirement
flatui = ["slateblue", "mediumorchid"]
sb.set_palette(flatui, desat = 0.8)
base_color = sb.color_palette()[0]
sb.set_style('white')
plt.figure(figsize = [6, 4])
# y axis: one tick every 20,000 rentals, labelled in thousands
max_count = categorical_counts.rentals.max()
y_tick_values = np.arange(0, max_count + 20000, 20000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
# plot line plot
ax = sb.lineplot(data=categorical_counts, x = "month", y = "rentals", hue="trip_type",
                 style="trip_type", err_style="bars", linewidth=3)
plt.title('Aggregated monthly rentals based on trip type\n', weight = 'bold', fontsize = 16)
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nMonth of the year', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
# when a hue is used that appears as a legend title,
# remove this title, and replace it with custom title.
# The custom title 'Trip type' is passed via `title`, so the first legend
# entry is blanked rather than relabelled with an unrelated variable name.
ax.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1, framealpha = 1,
          borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5,
          title='Trip type', title_fontsize=12, fontsize=10, facecolor='white', markerfirst=True,
          handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.35, 1)).texts[0].set_text("");
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.43.b Line plot of Trip rentals based on trip type and month of the year.png', dpi=300, bbox_inches='tight')
Individual plots of aggregated distribution of monthly rentals over trip type:
# Facet grid: one monthly count plot per trip type.
# set the palette as per requirement
flatui = ["slateblue", "mediumorchid"]
sb.set_palette(flatui, desat = 0.8)
base_color = sb.color_palette()[0]
sb.set_style('white')
plt.figure(figsize = [8, 4])
# plot facet grid
g = sb.FacetGrid(data = bikeshare, col = 'trip_type', col_wrap = 2, height = 4, aspect = 1.2)
g.map(sb.countplot, "month", color = base_color, order = bikeshare.month.sort_values(ascending=True).unique())
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Aggregated monthly distribution of bike rentals categorized by trip type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
# Relabel the ticks facet by facet. Collecting the labels of ALL facets into
# one list and applying that list to every facet hands each axis more labels
# than it has ticks, so each facet gets exactly its own labels instead.
y_tick_names = []
x_tick_names = []
for ax in g.axes.flat:
    # y ticks: express the tick locations in thousands ('25 K')
    y_tick_names = ['{:0.0f} K'.format(tick/1000) for tick in ax.get_yticks()]
    # x ticks: keep the month text, only the font size changes
    x_tick_names = [x_label.get_text() for x_label in ax.get_xticklabels()]
    ax.set_yticklabels(y_tick_names, size = 12)
    ax.set_xticklabels(x_tick_names, size = 12)
g.set_xlabels('\nMonth of the year', size = 14)
g.set_ylabels('Rentals (thousands)\n', size = 14)
plt.subplots_adjust(wspace=0.1, hspace=0.3);
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.43.c Facet grid of Trip rentals based on trip type and month of the year.png', dpi=300, bbox_inches='tight')
Let us plot a wider variety of graphs to observe the distribution of the data and uncover hidden insights.
# Three side-by-side views (violin, box, strip) of month against trip type.
plt.figure(figsize=[14, 4])
sb.set_style('darkgrid')
# set the palette as per requirement
flatui = ["slateblue", "mediumorchid"]
sb.set_palette(flatui, desat=0.8)
base_color = sb.color_palette()[0]

# (plot function, plot-specific keyword arguments, panel title, y label);
# only the left-most panel carries a y-axis label
panels = [
    (sb.violinplot, {'inner': 'quartile'}, 'Bike rentals - Violin plot\n', 'Month of the year\n'),
    (sb.boxplot, {}, 'Bike rentals - Box plot\n', ''),
    (sb.stripplot, {'alpha': 0.002}, 'Bike rentals - Strip plot\n', ''),
]
for panel_number, (draw_panel, extra_kwargs, panel_title, panel_ylabel) in enumerate(panels, start=1):
    plt.subplot(1, 3, panel_number)
    draw_panel(data=bikeshare, x='trip_type', y='month', color=base_color, **extra_kwargs)
    plt.title(panel_title, weight='bold', fontsize=16)
    plt.xlabel('\nTrip type', fontsize=14)
    plt.ylabel(panel_ylabel, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

plt.subplots_adjust(wspace=0.3, hspace=0.3);
# bbox_inches='tight' adjusts the figure to include all of the x and y labels
plt.savefig('plots/3.2.43.d Distribution of Trip rentals based on trip type and month of the year.png', dpi=300, bbox_inches='tight')
Heat Map:
One alternative way of depicting the relationship between two categorical variables is through a Heat map. Heat maps were introduced earlier as the 2-d version of a histogram; here, we're using them as the 2-d version of a bar chart. The seaborn function heatmap is at home with this type of heat map implementation, but the input arguments are unlike most of the visualization functions. Instead of providing the original dataframe, we need to summarize the counts into a matrix that will then be plotted.
Now let's pivot the categorical dataset created earlier into a more appropriate data structure.
# Reshape the long-format counts into a matrix with one row per trip_type,
# one column per month and the rental counts as cell values.
# NOTE: this rebinds categorical_counts to the pivoted frame, so the cell
# cannot be re-run without first rebuilding the long-format frame.
categorical_counts = categorical_counts.pivot(index = 'trip_type', columns = 'month', values = 'rentals')
# Display the pivoted matrix
categorical_counts
The data is ready to be plotted as a
heat map.
# Heat map of rentals with one row per trip type and one column per month
plt.figure(figsize=[12, 2])
res = sb.heatmap(categorical_counts, cmap="YlGnBu", linewidths=0.1,
                 annot=True, fmt='.0f', annot_kws={'size': 10})
plt.title('Aggregated monthly distribution of rentals based on trip type\n', fontsize=16, weight='bold')
plt.xlabel('\nMonth of the year', fontsize=14)
plt.ylabel('Trip type\n', fontsize=14)
plt.xticks(fontsize="12")
plt.yticks(rotation=0, fontsize="12", va="center")

# Rewrite every cell annotation (a raw count) as its share of all rentals
total_rentals = bikeshare.shape[0]
for annotation in res.texts:
    share = np.round(np.round(int(annotation.get_text())/total_rentals, 4)*100, 1)
    annotation.set_text(str(share)+' %')

# bbox_inches='tight' adjusts the figure to include all of the x and y labels
plt.savefig('plots/3.2.43.e Heat map of Trip rentals based on trip type and month of the year.png', dpi=300, bbox_inches='tight')
Find average rentals based on the month of the year:
The
`trip_type` column needs to be converted into a categorical datatype; otherwise the `groupby` method will ignore `NaN` rental values and in turn produce an incorrect average rental value. Also, the `size()` method ignores the unused level combinations of the groups, hence use the `count()` method.
# Count the rentals in every (year, month, trip_type) group by counting the
# non-null trip_id values; reset_index names that column 'rentals'.
month_df = bikeshare.groupby([bikeshare["year"],
bikeshare["month"],
bikeshare["trip_type"]]).count()['trip_id'].reset_index(name='rentals')
# Replace missing counts with 0 and store the counts as integers
month_df['rentals'] = month_df['rentals'].fillna(0).astype(int)
# Preview the first ten rows
month_df.head(10)
# Point plot of the mean rentals per month, one line per trip type.
plt.figure(figsize=[8, 5])
sb.set_style('white')
flatui = ["#fff480"]
sb.set_palette(flatui, n_colors=1, desat=0.8)
base_color = sb.color_palette()[0]

# ci=None: plot the point estimates only, without confidence intervals
sb.pointplot(x="month", y="rentals", hue='trip_type', data=month_df,
             linestyles=['-', '-'], ci=None)
plt.title('Average monthly bike rentals categorized by trip type\n', weight='bold', fontsize=16)
plt.xlabel('\nMonth of the year', fontsize=14)
plt.ylabel('Avg. bike rentals\n', fontsize=14)
plt.xticks(fontsize=12)

# y axis: one tick every 5,000 rentals, from 0 up to a hard-coded maximum
locs, labels = plt.yticks()
monthly_rental_avg_max = 35000
y_tick_values = np.arange(0, monthly_rental_avg_max+5000, 5000)
y_tick_names = list(map('{:0.0f}'.format, y_tick_values))
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

# Legend is anchored outside the axes, to the right of the plot
legend_options = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
    framealpha=1, borderpad=1, borderaxespad=1, loc='upper right', labelspacing=0.5,
    title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.3, 1),
)
plt.legend(**legend_options)
sb.despine(top=True, right=True, left=False, bottom=False);

# Faint dashed guide line at every y tick
for loc in y_tick_values:
    plt.axhline(loc, ls='--', color='grey', linewidth=0.5, alpha=0.4)

# bbox_inches='tight' adjusts the figure to include all of the x and y labels
plt.savefig('plots/3.2.43.f Average monthly bike rentals categorized by trip type.png', dpi=300, bbox_inches='tight')
bikeshare.month and bike_type columns:¶
Columns: month, bike_type
Data type: (Categorical, ordered) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
Stack plot:
# Stacked bar chart: months on the x axis, one bar segment per bike type.
sb.set_style('white')
flatui = ["darkslateblue", "lightseagreen", "royalblue", "rebeccapurple"]
sb.set_palette(flatui, desat=0.8)
plt.figure(figsize=[8, 5])
cat1_order = bikeshare.month.sort_values(ascending=True).unique()
cat2_order = bikeshare.bike_type.sort_values(ascending=True).unique()

# Draw one layer of bars per bike type, each on top of the running total
bar_positions = np.arange(len(cat1_order))
baselines = np.zeros(len(cat1_order))
for cat2 in cat2_order:
    # rentals of this bike type per month; months without rentals count as 0
    inner_counts = bikeshare[bikeshare['bike_type'] == cat2]['month'].value_counts()
    segment_heights = inner_counts[cat1_order].fillna(0)
    plt.bar(x=bar_positions, height=segment_heights, bottom=baselines, alpha=0.8)
    baselines += segment_heights

# y axis: one tick every 10,000 rentals, labelled in thousands
month_totals = bikeshare.groupby([bikeshare['month']]).size().reset_index(name='rentals')
max_count = month_totals.max()['rentals']
y_tick_values = np.arange(0, max_count + 10000, 10000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]

plt.title('Aggregated monthly rentals based on bike type\n', weight='bold', fontsize=16)
plt.xlabel('\nMonth of the year', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.xticks(bar_positions, cat1_order, fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

legend_options = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
    framealpha=1, borderpad=1, borderaxespad=1, loc='upper right', labelspacing=0.5,
    title='Bike type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.25, 1),
)
plt.legend(cat2_order, **legend_options)
sb.despine();

# bbox_inches='tight' adjusts the figure to include all of the x and y labels
plt.savefig('plots/3.2.44.a Stack plot of rentals based on bike type and month of the year.png', dpi=300, bbox_inches='tight')
Line plot:
The data needs some summarization by grouping together the respective variables. Then, reset the index and name the summarized data values accordingly.
# Count the rentals in every (bike_type, month) group; size() yields the
# number of rows per group and reset_index names that column 'rentals'.
categorical_counts = bikeshare.groupby([bikeshare['bike_type'],
bikeshare['month']]).size().reset_index(name='rentals')
# Preview the first ten rows
categorical_counts.head(10)
# Line plot of the aggregated rentals per month, one line per bike type.
sb.set_style('white')
flatui = ['#ff91e2', '#91ffda', '#60acfc', '#bd91ff']
sb.set_palette(flatui, n_colors=4, desat=0.6)
plt.figure(figsize=[6, 4])

# y axis: one tick every 10,000 rentals, labelled in thousands
max_count = categorical_counts.rentals.max()
y_tick_values = np.arange(0, max_count + 10000, 10000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]

ax = sb.lineplot(x="month", y="rentals", hue="bike_type", style="bike_type",
                 data=categorical_counts, linewidth=4, err_style="bars",
                 markers=['o', 'o', 'o', 'o'], markersize=6)
# draw the first four plotted lines solid
for line_idx in range(4):
    ax.lines[line_idx].set_linestyle("-")

plt.title('Aggregated monthly rentals based on bike type\n', weight='bold', fontsize=16)
plt.xlabel('\nMonth of the year', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

# customize legend: no legend title; the first legend text is relabelled below
legend_options = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1, framealpha=1,
    borderpad=1, borderaxespad=1, loc='upper right', labelspacing=0.5,
    title='', title_fontsize=12, fontsize=10, facecolor='white', markerfirst=True,
    handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.3, 1),
)
leg = ax.legend(**legend_options)
leg_lines = leg.get_lines()
# legend handles 1-4 are made solid as well; handle 0 is left untouched
for line_idx in range(1, 5):
    leg_lines[line_idx].set_linestyle("-")
leg.texts[0].set_text("Bike type")
sb.despine();

# bbox_inches='tight' adjusts the figure to include all of the x and y labels
plt.savefig('plots/3.2.44.b Line plot of rentals based on bike type and month of the year.png', dpi=300, bbox_inches='tight')
Individual plots of aggregated distribution of monthly rentals over bike type:
# Facet grid: one monthly count plot per bike type.
# set the palette as per requirement
flatui = ["slateblue", "mediumorchid"]
sb.set_palette(flatui, desat = 0.8)
base_color = sb.color_palette()[0]
sb.set_style('white')
plt.figure(figsize = [8, 4])
# plot facet grid
g = sb.FacetGrid(data = bikeshare, col = 'bike_type', col_wrap = 2, height = 3, aspect = 1.2)
g.map(sb.countplot, "month", color = base_color, order = bikeshare.month.sort_values(ascending=True).unique())
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Aggregated monthly distribution of bike rentals categorized by bike type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
# Relabel the ticks facet by facet. Collecting the labels of ALL facets into
# one list and applying that list to every facet hands each axis more labels
# than it has ticks, so each facet gets exactly its own labels instead.
y_tick_names = []
x_tick_names = []
for ax in g.axes.flat:
    # y ticks: express the tick locations in thousands ('25 K')
    y_tick_names = ['{:0.0f} K'.format(tick/1000) for tick in ax.get_yticks()]
    # x ticks: keep the month text, only the font size changes
    x_tick_names = [x_label.get_text() for x_label in ax.get_xticklabels()]
    ax.set_yticklabels(y_tick_names, size = 12)
    ax.set_xticklabels(x_tick_names, size = 12)
g.set_xlabels('\nMonth of the year', size = 14)
g.set_ylabels('Rentals (thousands)\n', size = 14)
plt.subplots_adjust(wspace=0.1, hspace=0.3);
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.2.44.c Facet grid of rentals based on bike type and month of the year.png', dpi=300, bbox_inches='tight')
Let us plot a wider variety of graphs to observe the distribution of the data and uncover hidden insights.
# Three side-by-side views (violin, box, strip) of month against bike type.
plt.figure(figsize=[16, 4])
sb.set_style('darkgrid')
# set the palette as per requirement
flatui = ["slateblue", "mediumorchid"]
sb.set_palette(flatui, desat=0.8)
base_color = sb.color_palette()[0]

# (plot function, plot-specific keyword arguments, panel title, y label);
# only the left-most panel carries a y-axis label
panels = [
    (sb.violinplot, {'inner': 'quartile'}, 'Bike rentals - Violin plot\n', 'Month of the year\n'),
    (sb.boxplot, {}, 'Bike rentals - Box plot\n', ''),
    (sb.stripplot, {'alpha': 0.002}, 'Bike rentals - Strip plot\n', ''),
]
for panel_number, (draw_panel, extra_kwargs, panel_title, panel_ylabel) in enumerate(panels, start=1):
    plt.subplot(1, 3, panel_number)
    draw_panel(data=bikeshare, x='bike_type', y='month', color=base_color, **extra_kwargs)
    plt.title(panel_title, weight='bold', fontsize=16)
    plt.xlabel('\nBike type', fontsize=14)
    plt.ylabel(panel_ylabel, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

plt.subplots_adjust(wspace=0.2, hspace=0.3);
# bbox_inches='tight' adjusts the figure to include all of the x and y labels
plt.savefig('plots/3.2.44.d Distribution of Trip rentals based on bike type and month of the year.png', dpi=300, bbox_inches='tight')
Heat Map:
Now let's pivot the categorical dataset created earlier into a more appropriate data structure.
# Reshape the long-format counts into a matrix: one row per bike type,
# one column per month, cell value = number of rentals.
# NOTE: the result overwrites categorical_counts, so this cell cannot be
# re-run without first re-creating the long-format frame.
categorical_counts = categorical_counts.pivot(
    index = 'bike_type',
    columns = 'month',
    values = 'rentals',
)
categorical_counts
The data is ready to be plotted as the
Heat map.
# Heat map of rentals per bike type and month; each cell is annotated with
# its share of ALL rentals in bikeshare.
plt.figure(figsize = [12, 4])
res = sb.heatmap(categorical_counts, annot = True, fmt = '.0f', annot_kws={'size':10}, linewidths=0.1, cmap="YlGnBu")
plt.yticks(rotation=0, fontsize="12", va="center")
plt.xticks(fontsize="12")
plt.title('Monthly distribution of bike rentals based on bike type\n', fontsize = 16, weight = 'bold')
plt.xlabel('\nMonth of the year', fontsize=14)
plt.ylabel('Bike type\n', fontsize=14)
# convert the count annotations to percentages of the total number of trips
total_rentals = bikeshare.shape[0]
for t in res.texts:
    p = np.round(np.round(int(t.get_text())/total_rentals, 4)*100, 1)
    # shares that round to zero are shown as '< 0.1%' (as in the pass type and
    # fare type heat maps) instead of a misleading '0.0 %'
    if p < 0.1:
        t.set_text('< 0.1%')
    else:
        t.set_text(str(p)+' %')
# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.44.e Heat map of rentals based on bike type and month of the year.png', dpi=300, bbox_inches='tight')
Find average rentals based on the month:
The
bike_type needs to be converted into a categorical datatype; otherwise the groupby method will ignore NaN rental values and in turn produce an incorrect avg. rental value. Also, the size() method ignores the unused level combinations of the groups, hence use the count() method.
# Rentals per (year, month, bike type): count the trip ids in each group.
group_keys = ['year', 'month', 'bike_type']
month_df = bikeshare.groupby(group_keys).count()['trip_id'].reset_index(name='rentals')
# groups without any trips become 0 rentals
month_df['rentals'] = month_df['rentals'].fillna(0).astype(int)
month_df.head(10)
# Point plot of the monthly rentals per bike type; with several years per
# month in month_df, each point is pointplot's aggregate over those years.
plt.figure(figsize=[8,5])
sb.set_style('white')
# alternative palette (disabled): ['#ff91e2', '#91ffda', '#60acfc', '#bd91ff']
flatui = ['#ff7ddd', '#77f7cc', '#4b99eb', '#aa75fa']
sb.set_palette(flatui, n_colors=4, desat=0.6)
base_color = sb.color_palette()[0]
sb.pointplot(x = "month", y = "rentals", hue = 'bike_type', data = month_df,
             linestyles = "-", scale = 1, ci = None)
plt.title('Average monthly bike rentals categorized by bike type\n', weight = 'bold', fontsize = 16)
plt.xlabel('\nMonth of the year', fontsize = 14)
plt.ylabel('Avg. bike rentals\n', fontsize = 14)
plt.xticks(fontsize = 12)
# current y tick locations/labels (not used further in this cell)
locs, labels = plt.yticks()
# fixed y axis: 0 .. 35000 in steps of 5000, labelled in thousands
monthly_rental_avg_max = 35000
y_tick_values = np.arange(0, monthly_rental_avg_max+5000, 5000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize = 12)
legend_options = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
    framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5,
    title='Bike type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1, 1),
)
plt.legend(**legend_options)
sb.despine(top=True, right=True, left=False, bottom=False)
# faint horizontal guide line at every y tick
for loc in y_tick_values:
    plt.axhline(loc, ls='--', color='grey', linewidth=0.5, alpha=0.5)
# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.44.f Average monthly bike rentals categorized by bike type.png', dpi=300, bbox_inches='tight')
bikeshare.month and pass_type columns:
Columns: month, pass_type
Data type: (Categorical, ordered) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
Stack plot:
# Stacked bar chart: rentals per month, stacked by pass type.
sb.set_style('white')
plt.figure(figsize = [8, 5])
flatui = ["#26bda7", "#9b59b6", "#3498db", "#e74c3c", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.6)
cat1_order = bikeshare.month.sort_values(ascending=True).unique()
cat2_order = bikeshare.pass_type.sort_values(ascending=True).unique()
# one bar slot per month; baselines holds the running top of each stack
bar_positions = np.arange(len(cat1_order))
baselines = np.zeros(len(cat1_order))
for cat2 in cat2_order:
    # monthly counts for this pass type, arranged in month order
    inner_counts = bikeshare[bikeshare['pass_type'] == cat2]['month'].value_counts()
    heights = inner_counts[cat1_order].fillna(0)
    # draw this layer on top of what has been stacked so far
    plt.bar(x = bar_positions, height = heights, bottom = baselines, alpha = 0.8)
    baselines += heights
# y axis: 0 .. busiest month in steps of 10000, labelled in thousands
rentals_per_month = bikeshare.groupby([bikeshare['month']]).size().reset_index(name='rentals')
max_count = rentals_per_month.max()['rentals']
y_tick_values = np.arange(0, max_count + 10000, 10000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
plt.title('Aggregated monthly rentals based on pass type\n', weight = 'bold', fontsize = 16)
plt.xlabel('\nMonth of the year', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.xticks(bar_positions, cat1_order, fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
legend_options = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
    framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper left', labelspacing=0.5,
    title='Pass type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1, 1),
)
plt.legend(cat2_order, **legend_options)
sb.despine()
# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.45.a Stack plot of rentals based on pass type and month of the year.png', dpi=300, bbox_inches='tight')
Line plot:
The data needs some summarization by grouping together the respective variables. Then, reset the index and name the summarized data values accordingly.
# Number of rentals for every (pass type, month) pair, in long format.
rentals_by_group = bikeshare.groupby(['pass_type', 'month']).size()
categorical_counts = rentals_by_group.reset_index(name='rentals')
categorical_counts.head(10)
# Line plot: monthly rentals, one line per pass type.
sb.set_style('white')
flatui = ["#26bda7", "#9b59b6", "#3498db", "#e74c3c", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.6)
plt.figure(figsize = [6, 4])
# y axis: 0 .. largest group count in steps of 10000, labelled in thousands
max_count = categorical_counts.rentals.max()
y_tick_values = np.arange(0, max_count + 10000, 10000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
ax = sb.lineplot(x = "month", y = "rentals", hue="pass_type", data=categorical_counts,
                 linewidth=3, alpha = 0.8)
plt.title('Aggregated monthly rentals based on pass type\n', weight = 'bold', fontsize = 16)
plt.xlabel('\nMonth of the year', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
legend_options = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1, framealpha = 1,
    borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5, title_fontsize=12,
    fontsize=10, facecolor='white', markerfirst=True, handlelength=2,
    handletextpad=0.5, bbox_to_anchor=(1.35, 1),
)
legend_box = ax.legend(**legend_options)
# Replace the first legend text with a custom heading.
# NOTE(review): this assumes the first legend entry is the hue variable name;
# confirm against the installed seaborn version.
legend_box.texts[0].set_text("Pass type")
# sb.despine()
# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.45.b Line plot of rentals based on pass type and month of the year.png', dpi=300, bbox_inches='tight')
Individual plots of aggregated distribution of monthly rentals over pass type:
# Facet grid: monthly rental counts, one panel per pass type.
# set the palette as per requirement
flatui = ["slateblue", "mediumorchid"]
sb.set_palette(flatui, desat = 0.8)
base_color = sb.color_palette()[0]
sb.set_style('white')
# FacetGrid creates its own figure, so a preceding plt.figure() call would
# only leave an unused, empty figure behind.
g = sb.FacetGrid(data = bikeshare, col = 'pass_type', col_wrap = 3, height = 3, aspect = 1.2)
g.map(sb.countplot, "month", color = base_color, order = bikeshare.month.sort_values(ascending=True).unique())
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Aggregated monthly distribution of bike rentals categorized by pass type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
# Build the tick labels ONCE, from the first panel (FacetGrid shares the axes
# across panels by default). Collecting them from every panel would produce a
# list several times longer than the number of ticks on each axis.
# The y values come from the tick locations (not the rendered label text) and
# are expressed in thousands.
first_ax = g.axes.flat[0]
y_tick_names = ['{:0.0f} K'.format(tick_value / 1000) for tick_value in first_ax.get_yticks()]
x_tick_names = [tick_label.get_text() for tick_label in first_ax.get_xticklabels()]
g.set_yticklabels(y_tick_names, size = 12)
g.set_xticklabels(x_tick_names, size = 12)
g.set_xlabels('\nMonth of the year', size = 14)
g.set_ylabels('Rentals (thousands)\n', size = 14)
plt.subplots_adjust(wspace=0.1, hspace=0.3)
# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.45.c Facet grid of rentals based on pass type and month of the year.png', dpi=300, bbox_inches='tight')
Let us plot more variety of graphs to observe the distribution of data and hidden insights.
# Three side-by-side views (violin, box, strip) of month vs. pass type.
plt.figure(figsize = [18, 4])
sb.set_style('darkgrid')
# single base colour taken from the custom palette
flatui = ["slateblue", "mediumorchid"]
sb.set_palette(flatui, desat = 0.8)
base_color = sb.color_palette()[0]
# (plot function, extra keyword arguments, title, y-axis label) per panel;
# only the left-most panel carries a y-axis label
panel_specs = [
    (sb.violinplot, {'inner': 'quartile', 'color': base_color},
     'Bike rentals - Violin plot\n', 'Month of the year\n'),
    (sb.boxplot, {'color': base_color},
     'Bike rentals - Box plot\n', ''),
    (sb.stripplot, {'alpha': 0.002, 'color': base_color},
     'Bike rentals - Strip plot\n', ''),
]
for panel_no, (draw_panel, panel_kwargs, panel_title, panel_ylabel) in enumerate(panel_specs, start=1):
    plt.subplot(1, 3, panel_no)
    draw_panel(data = bikeshare, x = 'pass_type', y = 'month', **panel_kwargs)
    plt.title(panel_title, weight = 'bold', fontsize = 16)
    plt.xlabel('\nPass type', fontsize = 14)
    plt.ylabel(panel_ylabel, fontsize = 14)
    plt.xticks(fontsize = 12)
    plt.yticks(fontsize = 12)
plt.subplots_adjust(wspace=0.2, hspace=0.3)
# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.45.d Distribution of Trip rentals based on pass type and month of the year.png', dpi=300, bbox_inches='tight')
Heat Map:
Now let's pivot the categorical dataset created earlier into a more appropriate data structure.
# Reshape the long-format counts into a matrix: one row per pass type,
# one column per month, cell value = number of rentals.
# NOTE: the result overwrites categorical_counts, so this cell cannot be
# re-run without first re-creating the long-format frame.
categorical_counts = categorical_counts.pivot(
    index = 'pass_type',
    columns = 'month',
    values = 'rentals',
)
categorical_counts
The data is ready to be plotted as the
Heat map.
# Heat map of rentals per pass type and month; each cell is annotated with
# its share of ALL rentals in bikeshare.
plt.figure(figsize = [12, 4])
res = sb.heatmap(categorical_counts, annot = True, fmt = '.0f', annot_kws={'size':10},
                 linewidths=0.1, cmap="YlGnBu")
plt.title('Monthly distribution of bike rentals based on pass type\n', fontsize = 16, weight = 'bold')
plt.xlabel('\nMonth of the year', fontsize=14)
plt.ylabel('Pass type\n', fontsize=14)
plt.xticks(fontsize="12")
plt.yticks(rotation=0, fontsize="12", va="center")
# turn every count annotation into a percentage of the total number of trips;
# shares that round below 0.1 are shown as '< 0.1%'
for t in res.texts:
    p = np.round(np.round(int(t.get_text())/bikeshare.shape[0], 4)*100, 1)
    t.set_text('< 0.1%' if p < 0.1 else str(p)+' %')
# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.45.e Heat map of rentals based on pass type and month of the year.png', dpi=300, bbox_inches='tight')
Find average rentals based on the month:
The
pass_type needs to be converted into a categorical datatype; otherwise the groupby method will ignore NaN rental values and in turn produce an incorrect avg. rental value. Also, the size() method ignores the unused level combinations of the groups, hence use the count() method.
# Rentals per (year, month, pass type): count the trip ids in each group.
group_keys = ['year', 'month', 'pass_type']
month_df = bikeshare.groupby(group_keys).count()['trip_id'].reset_index(name='rentals')
# groups without any trips become 0 rentals
month_df['rentals'] = month_df['rentals'].fillna(0).astype(int)
month_df.head(10)
# Point plot of the monthly rentals per pass type; with several years per
# month in month_df, each point is pointplot's aggregate over those years.
plt.figure(figsize=[8,5])
sb.set_style('white')
# alternative palette (disabled): ["#26bda7", "#9b59b6", "#3498db", "#e74c3c", "#34495e"]
flatui = ["#34e0c7", "#c271e3", "#4cb1f5", "#e06458", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.6)
base_color = sb.color_palette()[0]
sb.pointplot(x = "month", y = "rentals", hue = 'pass_type', data = month_df,
             linestyles = "-", scale = 1, ci = None)
plt.title('Average monthly bike rentals categorized by pass type\n', weight = 'bold', fontsize = 16)
plt.xlabel('\nMonth of the year', fontsize = 14)
plt.ylabel('Avg. bike rentals\n', fontsize = 14)
plt.xticks(fontsize = 12)
# current y tick locations/labels (not used further in this cell)
locs, labels = plt.yticks()
# fixed y axis: 0 .. 35000 in steps of 5000, labelled with the plain count
monthly_rental_avg_max = 35000
y_tick_values = np.arange(0, monthly_rental_avg_max+5000, 5000)
y_tick_names = ['{:0.0f}'.format(v) for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize = 12)
legend_options = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
    framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper left', labelspacing=0.5,
    title='Pass type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1, 1),
)
plt.legend(**legend_options)
sb.despine(top=True, right=True, left=False, bottom=False)
# faint horizontal guide line at every y tick
for loc in y_tick_values:
    plt.axhline(loc, ls='--', color='grey', linewidth=0.5, alpha=0.3)
# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.45.f Average monthly bike rentals categorized by pass type.png', dpi=300, bbox_inches='tight')
bikeshare.month and fare_type columns:
Columns: month, fare_type
Data type: (Categorical, ordered) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
Stack plot:
# Stacked bar chart: rentals per month, stacked by fare type.
sb.set_style('white')
plt.figure(figsize = [8, 5])
flatui = ['#466887', '#eda668']
sb.set_palette(flatui, n_colors=2, desat=0.6)
cat1_order = bikeshare.month.sort_values(ascending=True).unique()
cat2_order = bikeshare.fare_type.sort_values(ascending=True).unique()
# one bar slot per month; baselines holds the running top of each stack
bar_positions = np.arange(len(cat1_order))
baselines = np.zeros(len(cat1_order))
for cat2 in cat2_order:
    # monthly counts for this fare type, arranged in month order
    inner_counts = bikeshare[bikeshare['fare_type'] == cat2]['month'].value_counts()
    heights = inner_counts[cat1_order].fillna(0)
    # draw this layer on top of what has been stacked so far
    plt.bar(x = bar_positions, height = heights, bottom = baselines, alpha = 0.8)
    baselines += heights
# y axis: 0 .. busiest month in steps of 10000, labelled in thousands
rentals_per_month = bikeshare.groupby([bikeshare['month']]).size().reset_index(name='rentals')
max_count = rentals_per_month.max()['rentals']
y_tick_values = np.arange(0, max_count + 10000, 10000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
plt.title('Aggregated monthly rentals based on fare type\n', weight = 'bold', fontsize = 16)
plt.xlabel('\nMonth of the year', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.xticks(bar_positions, cat1_order, fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
legend_options = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
    framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5,
    title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.2, 1),
)
plt.legend(cat2_order, **legend_options)
sb.despine()
# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
# NOTE(review): the file name says 'day of the month' although this plot is
# by month of the year; the path is kept unchanged.
plt.savefig('plots/3.2.46.a Stack plot of rentals based on fare type and day of the month.png', dpi=300, bbox_inches='tight')
Line plot:
The data needs some summarization by grouping together the respective variables. Then, reset the index and name the summarized data values accordingly.
# Number of rentals for every (fare type, month) pair, in long format.
rentals_by_group = bikeshare.groupby(['fare_type', 'month']).size()
categorical_counts = rentals_by_group.reset_index(name='rentals')
categorical_counts.head(10)
# Line plot: monthly rentals, one line per fare type.
sb.set_style('white')
flatui = ['#466887', '#eda668']
sb.set_palette(flatui, n_colors=2, desat=0.6)
plt.figure(figsize = [6, 4])
# y axis: 0 .. largest group count in steps of 10000, labelled in thousands
max_count = categorical_counts.rentals.max()
y_tick_values = np.arange(0, max_count + 10000, 10000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
ax = sb.lineplot(x = "month", y = "rentals", hue="fare_type", data=categorical_counts,
                 linewidth=3, alpha = 0.8)
plt.title('Aggregated monthly rentals based on fare type\n', weight = 'bold', fontsize = 16)
plt.xlabel('\nMonth of the year', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
legend_options = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1, framealpha = 1,
    borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5,
    title='', title_fontsize=12, fontsize=10, facecolor='white', markerfirst=True,
    handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.35, 1),
)
legend_box = ax.legend(**legend_options)
# Replace the first legend text with a custom heading.
# NOTE(review): this assumes the first legend entry is the hue variable name;
# confirm against the installed seaborn version.
legend_box.texts[0].set_text("Fare type")
sb.despine()
# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
# NOTE(review): the file name says 'day of the month' although this plot is
# by month of the year; the path is kept unchanged.
plt.savefig('plots/3.2.46.b Line plot of rentals based on fare type and day of the month.png', dpi=300, bbox_inches='tight')
Individual plots of aggregated distribution of monthly rentals over fare type:
# Facet grid: monthly rental counts, one panel per fare type.
# set the palette as per requirement
flatui = ["slateblue", "mediumorchid"]
sb.set_palette(flatui, desat = 0.8)
base_color = sb.color_palette()[0]
sb.set_style('white')
# FacetGrid creates its own figure, so a preceding plt.figure() call would
# only leave an unused, empty figure behind.
g = sb.FacetGrid(data = bikeshare, col = 'fare_type', col_wrap = 2, height = 4, aspect = 1.2)
g.map(sb.countplot, "month", color = base_color, order = bikeshare.month.sort_values(ascending=True).unique())
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Aggregated monthly distribution of bike rentals categorized by fare type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
# Build the tick labels ONCE, from the first panel (FacetGrid shares the axes
# across panels by default). Collecting them from every panel would produce a
# list several times longer than the number of ticks on each axis.
# The y values come from the tick locations (not the rendered label text) and
# are expressed in thousands.
first_ax = g.axes.flat[0]
y_tick_names = ['{:0.0f} K'.format(tick_value / 1000) for tick_value in first_ax.get_yticks()]
x_tick_names = [tick_label.get_text() for tick_label in first_ax.get_xticklabels()]
g.set_yticklabels(y_tick_names, size = 12)
g.set_xticklabels(x_tick_names, size = 12)
g.set_xlabels('\nMonth of the year', size = 14)
g.set_ylabels('Rentals (thousands)\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3)
# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
# NOTE(review): the file name says 'day of the month' although this plot is
# by month of the year; the path is kept unchanged.
plt.savefig('plots/3.2.46.c Facet grid of rentals based on fare type and day of the month.png', dpi=300, bbox_inches='tight')
Let us plot more variety of graphs to observe the distribution of data and hidden insights.
# Three side-by-side views (violin, box, strip) of month vs. fare type.
plt.figure(figsize = [14, 4])
sb.set_style('darkgrid')
# single base colour taken from the custom palette
flatui = ["slateblue", "mediumorchid"]
sb.set_palette(flatui, desat = 0.8)
base_color = sb.color_palette()[0]
# (plot function, extra keyword arguments, title, y-axis label) per panel;
# only the left-most panel carries a y-axis label
panel_specs = [
    (sb.violinplot, {'inner': 'quartile', 'color': base_color},
     'Bike rentals - Violin plot\n', 'Month of the year\n'),
    (sb.boxplot, {'color': base_color},
     'Bike rentals - Box plot\n', ''),
    (sb.stripplot, {'alpha': 0.002, 'color': base_color},
     'Bike rentals - Strip plot\n', ''),
]
for panel_no, (draw_panel, panel_kwargs, panel_title, panel_ylabel) in enumerate(panel_specs, start=1):
    plt.subplot(1, 3, panel_no)
    draw_panel(data = bikeshare, x = 'fare_type', y = 'month', **panel_kwargs)
    plt.title(panel_title, weight = 'bold', fontsize = 16)
    plt.xlabel('\nFare type', fontsize = 14)
    plt.ylabel(panel_ylabel, fontsize = 14)
    plt.xticks(fontsize = 12)
    plt.yticks(fontsize = 12)
plt.subplots_adjust(wspace=0.3, hspace=0.3)
# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.46.d Distribution of Trip rentals based on fare type and month of the year.png', dpi=300, bbox_inches='tight')
Heat Map:
Now let's pivot the categorical dataset created earlier into a more appropriate data structure.
# Reshape the long-format counts into a matrix: one row per fare type,
# one column per month, cell value = number of rentals.
# NOTE: the result overwrites categorical_counts, so this cell cannot be
# re-run without first re-creating the long-format frame.
categorical_counts = categorical_counts.pivot(
    index = 'fare_type',
    columns = 'month',
    values = 'rentals',
)
categorical_counts
The data is ready to be plotted as the
Heat map.
# Heat map of rentals per fare type and month; each cell is annotated with
# its share of ALL rentals in bikeshare.
plt.figure(figsize = [12, 2])
res = sb.heatmap(categorical_counts, annot = True, fmt = '.0f', annot_kws={'size':10},
                 linewidths=0.1, cmap="YlGnBu")
plt.title('Monthly distribution of bike rentals based on fare type\n', fontsize = 16, weight = 'bold')
plt.xlabel('\nMonth of the year', fontsize=14)
plt.ylabel('Fare type\n', fontsize=14)
plt.xticks(fontsize="12")
plt.yticks(rotation=0, fontsize="12", va="center")
# turn every count annotation into a percentage of the total number of trips;
# shares that round below 0.1 are shown as '< 0.1%'
for t in res.texts:
    p = np.round(np.round(int(t.get_text())/bikeshare.shape[0], 4)*100, 1)
    t.set_text('< 0.1%' if p < 0.1 else str(p)+' %')
# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
# NOTE(review): the file name says 'day of the month' although this plot is
# by month of the year; the path is kept unchanged.
plt.savefig('plots/3.2.46.e Heat map of rentals based on fare type and day of the month.png', dpi=300, bbox_inches='tight')
Find average rentals based on the month:
The
fare_type needs to be converted into a categorical datatype; otherwise the groupby method will ignore NaN rental values and in turn produce an incorrect avg. rental value. Also, the size() method ignores the unused level combinations of the groups, hence use the count() method.
# Rentals per (year, month, fare type): count the trip ids in each group.
group_keys = ['year', 'month', 'fare_type']
month_df = bikeshare.groupby(group_keys).count()['trip_id'].reset_index(name='rentals')
# groups without any trips become 0 rentals
month_df['rentals'] = month_df['rentals'].fillna(0).astype(int)
month_df.head(10)
# Point plot of the monthly rentals per fare type; with several years per
# month in month_df, each point is pointplot's aggregate over those years.
plt.figure(figsize=[7, 5])
sb.set_style('white')
flatui = ['#577da1', '#eda668']
sb.set_palette(flatui, n_colors=2, desat=0.6)
base_color = sb.color_palette()[0]
sb.pointplot(x = "month", y = "rentals", hue = 'fare_type', data = month_df,
             linestyles = "-", scale = 1, ci = None)
plt.title('Average monthly bike rentals categorized by fare type\n', weight = 'bold', fontsize = 16)
plt.xlabel('\nMonth of the year', fontsize = 14)
plt.ylabel('Avg. bike rentals\n', fontsize = 14)
plt.xticks(fontsize = 12)
# current y tick locations/labels (not used further in this cell)
locs, labels = plt.yticks()
# fixed y axis: 0 .. 35000 in steps of 5000, labelled with the plain count
monthly_rental_avg_max = 35000
y_tick_values = np.arange(0, monthly_rental_avg_max+5000, 5000)
y_tick_names = ['{:0.0f}'.format(v) for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize = 12)
legend_options = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
    framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5,
    title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.3, 1),
)
plt.legend(**legend_options)
sb.despine(top=True, right=True, left=False, bottom=False)
# faint horizontal guide line at every y tick
for loc in y_tick_values:
    plt.axhline(loc, ls='--', color='grey', linewidth=0.5, alpha=0.3)
# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.46.f Average monthly bike rentals categorized by fare type.png', dpi=300, bbox_inches='tight')
bikeshare.quarter and trip_type columns:
Columns: quarter, trip_type
Data type: (Categorical, ordered) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
Stack plot:
# Stacked bar chart: rentals per quarter, stacked by trip type.
sb.set_style('white')
flatui = ["#47b1c9", "#4c5e58"]
sb.set_palette(flatui, desat = 0.8)
plt.figure(figsize = [7, 6])
cat1_order = bikeshare.quarter.sort_values(ascending=True).unique()
cat2_order = bikeshare.trip_type.sort_values(ascending=True).unique()
# one bar slot per quarter; baselines holds the running top of each stack
bar_positions = np.arange(len(cat1_order))
baselines = np.zeros(len(cat1_order))
for cat2 in cat2_order:
    # quarterly counts for this trip type, arranged in quarter order
    inner_counts = bikeshare[bikeshare['trip_type'] == cat2]['quarter'].value_counts()
    heights = inner_counts[cat1_order].fillna(0)
    # draw this layer on top of what has been stacked so far
    plt.bar(x = bar_positions, height = heights, bottom = baselines, alpha =0.6)
    baselines += heights
# y axis: 0 .. busiest quarter in steps of 50000, labelled in thousands
rentals_per_quarter = bikeshare.groupby([bikeshare['quarter']]).size().reset_index(name='rentals')
max_count = rentals_per_quarter.max()['rentals']
y_tick_values = np.arange(0, max_count + 50000, 50000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
plt.title('Aggregated quarterly rentals based on trip type\n', weight = 'bold', fontsize = 16)
plt.xlabel('\nQuarter of the year', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.xticks(bar_positions, cat1_order, fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
legend_options = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
    framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5,
    title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.3, 1),
)
plt.legend(cat2_order, **legend_options)
sb.despine()
# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.47.a Stack plot of Trip rentals based on trip type and quarter of the year.png', dpi=300, bbox_inches='tight')
Line Plot:
The data needs some summarization by grouping together the respective variables. Then, reset the index and name the summarized data values accordingly.
# Number of rentals for every (trip type, quarter) pair, in long format.
rentals_by_group = bikeshare.groupby(['trip_type', 'quarter']).size()
categorical_counts = rentals_by_group.reset_index(name='rentals')
categorical_counts.head(10)
# Line plot: quarterly rentals, one marked line per trip type.
flatui = ["#47b1c9", "#4c5e58"]
sb.set_palette(flatui, desat = 0.8)
sb.set_style('white')
plt.figure(figsize = [6, 4])
# y axis: 0 .. largest group count in steps of 50000, labelled in thousands
max_count = categorical_counts.rentals.max()
y_tick_values = np.arange(0, max_count + 50000, 50000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
ax = sb.lineplot(x = "quarter", y = "rentals", hue="trip_type", style="trip_type",
                 data=categorical_counts, err_style="bars", linewidth=3,
                 markers = ['o', 'o'])
plt.title('Aggregated quarterly rentals based on trip type\n', weight = 'bold', fontsize = 16)
plt.xlabel('\nQuarter of the year', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.xticks(fontsize=10)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
legend_options = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1, framealpha = 1,
    borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5,
    title='Trip type', title_fontsize=12, fontsize=10, facecolor='white', markerfirst=True,
    handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.35, 1),
)
legend_box = ax.legend(**legend_options)
# Blank out the first legend text; the legend title carries the heading.
# NOTE(review): this assumes the first legend entry is the hue variable name;
# confirm against the installed seaborn version.
legend_box.texts[0].set_text("")
sb.despine()
# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.47.b Line plot of Trip rentals based on trip type and quarter of the year.png', dpi=300, bbox_inches='tight')
Individual plots of aggregated distribution of quarterly rentals over trip type:
# Facet grid: quarterly rental counts, one panel per trip type.
# set the palette as per requirement
flatui = ["#47b1c9", "#4c5e58"]
sb.set_palette(flatui, desat = 0.8)
base_color = sb.color_palette()[0]
sb.set_style('white')
# FacetGrid creates its own figure, so a preceding plt.figure() call would
# only leave an unused, empty figure behind.
g = sb.FacetGrid(data = bikeshare, col = 'trip_type', col_wrap = 2, height = 4, aspect = 1)
g.map(sb.countplot, "quarter", color = base_color, order = bikeshare.quarter.sort_values(ascending=True).unique())
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Aggregated quarterly distribution of bike rentals categorized by trip type\n', fontsize = 14, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
# Build the tick labels ONCE, from the first panel (FacetGrid shares the axes
# across panels by default). Collecting them from every panel would produce a
# list several times longer than the number of ticks on each axis.
# The y values come from the tick locations (not the rendered label text) and
# are expressed in thousands.
first_ax = g.axes.flat[0]
y_tick_names = ['{:0.0f} K'.format(tick_value / 1000) for tick_value in first_ax.get_yticks()]
x_tick_names = [tick_label.get_text() for tick_label in first_ax.get_xticklabels()]
g.set_yticklabels(y_tick_names, size = 10)
g.set_xticklabels(x_tick_names, size = 10)
g.set_xlabels('\nQuarter of the year', size = 12)
g.set_ylabels('Rentals (thousands)\n', size = 12)
plt.subplots_adjust(wspace=0.1, hspace=0.3)
# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.47.c Facet grid of Trip rentals based on trip type and quarter of the year.png', dpi=300, bbox_inches='tight')
Let us plot more variety of graphs to observe the distribution of data and hidden insights.
Heat Map:
One alternative way of depicting the relationship between two categorical variables is through a Heat map. Heat maps were introduced earlier as the 2-d version of a histogram; here, we're using them as the 2-d version of a bar chart. The seaborn function heatmap is at home with this type of heat map implementation, but the input arguments are unlike most of the visualization functions. Instead of providing the original dataframe, we need to summarize the counts into a matrix that will then be plotted.
Now let's pivot the categorical dataset created earlier into a more appropriate data structure.
# Reshape the long-format counts into a matrix: one row per trip type,
# one column per quarter, cell value = number of rentals.
# NOTE: the result overwrites categorical_counts, so this cell cannot be
# re-run without first re-creating the long-format frame.
categorical_counts = categorical_counts.pivot(
    index = 'trip_type',
    columns = 'quarter',
    values = 'rentals',
)
categorical_counts
The data is ready to be plotted as the
Heat map.
# Heat map of rentals per trip type and quarter; each cell is annotated with
# its share of ALL rentals in bikeshare.
plt.figure(figsize = [8, 2])
res = sb.heatmap(categorical_counts, annot = True, fmt = '.0f', annot_kws={'size':12}, linewidths=0.1, cmap="YlGnBu")
plt.yticks(rotation=0, fontsize="12", va="center")
plt.xticks(fontsize="12")
plt.title('Quarterly distribution of rentals based on trip type\n', fontsize = 16, weight = 'bold')
plt.xlabel('\nQuarter of the year', fontsize=14)
plt.ylabel('Trip type\n', fontsize=14)
# convert the count annotations to percentages of the total number of trips
total_rentals = bikeshare.shape[0]
for t in res.texts:
    p = np.round(np.round(int(t.get_text())/total_rentals, 4)*100, 1)
    # shares that round to zero are shown as '< 0.1%' (as in the pass type and
    # fare type heat maps) instead of a misleading '0.0 %'
    if p < 0.1:
        t.set_text('< 0.1%')
    else:
        t.set_text(str(p)+' %')
# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.47.d Heat map of Trip rentals based on trip type and quarter of the year.png', dpi=300, bbox_inches='tight')
Find average rentals based on the quarter:
The trip_type column needs to be converted to a categorical datatype; otherwise the groupby method will ignore NaN rental values and in turn produce an incorrect avg. rental value. Also, the size() method ignores the unused level combinations of the groups, hence the count() method is used.
# Count trip_id values for every (year, quarter, trip_type) group and keep
# the result as a long-format frame with a 'rentals' column.
quarter_df = (
    bikeshare
    .groupby(['year', 'quarter', 'trip_type'])['trip_id']
    .count()
    .reset_index(name='rentals')
)
# Replace any missing count with 0 and store the counts as integers.
quarter_df['rentals'] = quarter_df['rentals'].fillna(0).astype(int)
quarter_df.head(10)
# Point plot of rentals per quarter, one line per trip type. quarter_df holds
# one row per (year, quarter, trip_type); the plot title describes the points
# as the average over those rows.
plt.figure(figsize=[7, 5])
sb.set_style('white')
flatui = ["#fff480"]
sb.set_palette(flatui, n_colors=1, desat=0.8)
base_color = sb.color_palette()[0]
sb.pointplot(data=quarter_df, x="quarter", y="rentals", hue='trip_type',
             linestyles=['-', '-'], ci=None)
plt.title('Average quarterly bike rentals categorized by trip type\n', weight='bold', fontsize=16)
plt.ylabel('Avg. bike rentals\n', fontsize=14)
plt.xlabel('\nQuarter of the year', fontsize=14)
plt.xticks(fontsize=12)

# Fixed y axis: one tick every 10 K rentals from 0 up to 100 K
quarter_rental_avg_max = 100000
y_tick_values = np.arange(0, quarter_rental_avg_max + 10000, 10000)
y_tick_names = ['{:0.0f} K'.format(v / 1000) for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

# add annotations
# -------------------------------------------------------
# Average only the 'rentals' column. Calling mean() on the whole grouped frame
# would also try to average the remaining 'year' column, which fails when that
# column is not numeric.
avg_rentals = (
    quarter_df
    .groupby(['trip_type', 'quarter'])['rentals']
    .mean()
    .reset_index()
)
avg_rentals_max = avg_rentals.rentals.max()
avg_rentals_oneway = avg_rentals.query(' trip_type == "One Way" ')
avg_rentals_roundtrip = avg_rentals.query(' trip_type == "Round Trip" ')

# get the current x tick locations and labels (one per quarter)
locs, labels = plt.xticks()

# Vertical offset (in rentals) between a point and its annotation
indent = 5000
annotation_groups = [
    (avg_rentals_oneway, 'salmon'),
    (avg_rentals_roundtrip, 'limegreen'),
]
for categorical_df, clr in annotation_groups:
    # pair every x tick location with the average rentals of that quarter
    for loc, count in zip(locs, categorical_df.rentals):
        pct_string = '{:0.0f} K'.format(count / 1000)
        plt.text(loc, count + indent, pct_string, ha='center', color='black',
                 fontsize=12, bbox=dict(pad=1.9, alpha=0.2, color='none', fc=clr))
# -------------------------------------------------------

plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
           framealpha=1, borderpad=1, borderaxespad=1, loc='upper right', labelspacing=0.5,
           title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.3, 1))

# faint horizontal guide line at every y tick
for loc in y_tick_values:
    plt.axhline(loc, ls='--', color='grey', linewidth=0.5, alpha=0.3)
sb.despine(top=True, right=True, left=False, bottom=False)

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.47.e Average quarterly bike rentals categorized by trip type.png', dpi=300, bbox_inches='tight')
bikeshare.quarter and bike_type columns:
Columns: quarter, bike_type
Data type: (Categorical, ordered) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
Stack plot:
# Stacked bar chart: rentals per quarter, split by bike type.
# Assign palette as per requirement
sb.set_style('white')
flatui = ["darkslateblue", "lightseagreen", "royalblue", "rebeccapurple"]
sb.set_palette(flatui, desat=0.8)
plt.figure(figsize=[7, 6])

# cat1 = categories along the x axis, cat2 = categories stacked within a bar
cat1_order = bikeshare.quarter.sort_values(ascending=True).unique()
cat2_order = bikeshare.bike_type.sort_values(ascending=True).unique()
bar_positions = np.arange(len(cat1_order))

# plot stacked plot: every bike type is drawn on top of the running total
baselines = np.zeros(len(cat1_order))
for cat2 in cat2_order:
    # rentals of this bike type in each quarter
    inner_counts = bikeshare[bikeshare['bike_type'] == cat2]['quarter'].value_counts()
    # reindex() puts the counts in x-axis order and gives NaN (filled with 0)
    # for a quarter that has no rentals of this bike type
    heights = inner_counts.reindex(cat1_order).fillna(0).values
    plt.bar(x=bar_positions, height=heights, bottom=baselines, alpha=0.8)
    baselines += heights

# improve plot aesthetics
# y ticks every 50 K rentals, up to the first tick at or above the busiest quarter
max_count = bikeshare.groupby('quarter').size().max()
y_tick_values = np.arange(0, max_count + 50000, 50000)
y_tick_names = ['{:0.0f} K'.format(v / 1000) for v in y_tick_values]
plt.title('Aggregated quarterly rentals based on bike type\n', weight='bold', fontsize=16)
plt.xticks(bar_positions, cat1_order, fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nQuarter of the year', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.legend(cat2_order, scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
           framealpha=1, borderpad=1, borderaxespad=1, loc='upper right', labelspacing=0.5,
           title='Bike type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.25, 1))
sb.despine()

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.48.a Stack plot of rentals based on bike type and quarter of the year.png', dpi=300, bbox_inches='tight')
Line plot:
The data needs some summarization by grouping together the respective variables. Then, reset the index and name the summarized data values accordingly.
# Number of rentals for every bike type / quarter pair, in long format
categorical_counts = (
    bikeshare
    .groupby(['bike_type', 'quarter'])
    .size()
    .reset_index(name='rentals')
)
categorical_counts.head(10)
# Line plot of rentals per quarter, one line per bike type.
# set the palette as per requirement
sb.set_style('white')
# alternative palette: "darkslateblue", "lightseagreen", "royalblue", "rebeccapurple"
flatui = ['#ff91e2', '#91ffda', '#60acfc', '#bd91ff']
sb.set_palette(flatui, n_colors=4, desat=0.8)
plt.figure(figsize=[6, 4])

# y ticks every 25 K rentals, up to the first tick at or above the largest count
max_count = categorical_counts.rentals.max()
y_tick_values = np.arange(0, max_count + 25000, 25000)
y_tick_names = ['{:0.0f} K'.format(tick / 1000) for tick in y_tick_values]

# plot line plot
ax = sb.lineplot(
    data=categorical_counts,
    x="quarter",
    y="rentals",
    hue="bike_type",
    linewidth=3,
)
plt.title('Aggregated quarterly rentals based on bike type\n', weight='bold', fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nQuarter of the year', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)

# Custom legend with its own title. The first legend text is then blanked;
# NOTE(review): this presumably hides the hue variable's name that seaborn
# adds as the first legend entry - confirm against the installed seaborn.
legend = plt.legend(
    title='Bike type', title_fontsize=12, fontsize=10,
    loc='upper right', bbox_to_anchor=(1.3, 1), ncol=1,
    frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
    borderpad=1, borderaxespad=1, labelspacing=0.5,
    scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
)
legend.texts[0].set_text("")
sb.despine()

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.48.b Line plot of rentals based on bike type and quarter of the year.png', dpi=300, bbox_inches='tight')
Individual plots of aggregated distribution of quarterly rentals over bike type:
# Facet grid: one count plot of rentals per quarter for every bike type.
# set the palette as per requirement
flatui = ["#47b1c9", "#4c5e58"]
sb.set_palette(flatui, desat=0.8)
base_color = sb.color_palette()[0]
sb.set_style('white')

# plot facet grid
# FacetGrid creates its own figure, so no separate plt.figure() call is made
# (one would only leave an empty figure behind).
quarter_order = bikeshare.quarter.sort_values(ascending=True).unique()
g = sb.FacetGrid(data=bikeshare, col='bike_type', col_wrap=2, height=4, aspect=1)
g.map(sb.countplot, "quarter", color=base_color, order=quarter_order)
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Aggregated quarterly distribution of bike rentals categorized by bike type\n', fontsize=16, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Read the tick labels from the first facet only: set_yticklabels and
# set_xticklabels apply one list to every facet, so the list must hold exactly
# one label per tick rather than the labels of all facets concatenated.
first_ax = g.axes.flat[0]
# y tick labels are raw counts; rewrite them in thousands ('K')
y_tick_names = ['{:0.0f} K'.format(int(y_label.get_text()) / 1000)
                for y_label in first_ax.get_yticklabels()]
# x tick labels are kept as they are
x_tick_names = [x_label.get_text() for x_label in first_ax.get_xticklabels()]

g.set_yticklabels(y_tick_names, size=10)
g.set_xticklabels(x_tick_names, size=10)
g.set_xlabels('\nQuarter of the year', size=12)
g.set_ylabels('Rentals (thousands)\n', size=12)
plt.subplots_adjust(wspace=0.1, hspace=0.3)

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.48.c Facet grid of rentals based on bike type and quarter of the year.png', dpi=300, bbox_inches='tight')
Let us plot a wider variety of graphs to observe the distribution of the data and uncover hidden insights.
Heat Map:
Now let's pivot the categorical dataset created earlier into a more appropriate data structure.
# Reshape the long-format counts into a bike_type x quarter matrix of rentals,
# the layout that is passed to sb.heatmap in the next cell.
categorical_counts = categorical_counts.pivot(
    values='rentals',
    index='bike_type',
    columns='quarter',
)
categorical_counts
The data is now ready to be plotted as a heat map.
# Heat map of the bike_type x quarter rental counts, annotated per cell
plt.figure(figsize=[7, 4])
res = sb.heatmap(
    categorical_counts,
    cmap="YlGnBu",
    linewidths=0.1,
    annot=True,
    fmt='.0f',
    annot_kws={'size': 12},
)
plt.yticks(rotation=0, fontsize="12", va="center")
plt.xticks(fontsize="12")
plt.title('Quarterly distribution of bike rentals based on bike type\n',
          fontsize=16, weight='bold')
plt.xlabel('\nQuarter of the year', fontsize=14)
plt.ylabel('Bike type\n', fontsize=14)

# Each annotation holds a raw count; rewrite it as that cell's share of all
# rows in bikeshare, as a percentage with one decimal place.
for t in res.texts:
    share = np.round(int(t.get_text()) / bikeshare.shape[0], 4)
    p = np.round(share * 100, 1)
    t.set_text(str(p) + ' %')

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.48.d Heat map of rentals based on bike type and quarter of the year.png', dpi=300, bbox_inches='tight')
Find average rentals based on the quarter of the year:
The bike_type column needs to be converted to a categorical datatype; otherwise the groupby method will ignore NaN rental values and in turn produce an incorrect avg. rental value. Also, the size() method ignores the unused level combinations of the groups, hence the count() method is used.
# Count trip_id values for every (year, quarter, bike_type) group and keep
# the result as a long-format frame with a 'rentals' column.
quarter_df = (
    bikeshare
    .groupby(['year', 'quarter', 'bike_type'])['trip_id']
    .count()
    .reset_index(name='rentals')
)
# Replace any missing count with 0 and store the counts as integers.
quarter_df['rentals'] = quarter_df['rentals'].fillna(0).astype(int)
quarter_df.head(10)
# Point plot of rentals per quarter, one line per bike type, drawn from the
# per-(year, quarter, bike_type) counts in quarter_df.
plt.figure(figsize=[7, 5])
sb.set_style('white')
# alternative palette: '#ff91e2', '#91ffda', '#60acfc', '#bd91ff'
flatui = ['#ff7ddd', '#77f7cc', '#4b99eb', '#aa75fa']
sb.set_palette(flatui, n_colors=4, desat=0.6)
base_color = sb.color_palette()[0]
sb.pointplot(
    data=quarter_df,
    x="quarter",
    y="rentals",
    hue='bike_type',
    linestyles="-",
    scale=1,
    ci=None,
)
plt.title('Average quarterly bike rentals categorized by bike type\n', weight='bold', fontsize=16)
plt.ylabel('Avg. bike rentals\n', fontsize=14)
plt.xlabel('\nQuarter of the year', fontsize=14)
plt.xticks(fontsize=12)

# current y tick locations and labels (replaced by the fixed ticks below)
locs, labels = plt.yticks()
# fixed y axis: one tick every 10 K rentals from 0 up to 100 K
quarterly_rental_avg_max = 100000
y_tick_values = np.arange(0, quarterly_rental_avg_max + 10000, 10000)
y_tick_names = ['{:0.0f} K'.format(tick / 1000) for tick in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

plt.legend(
    title='Bike type', title_fontsize=12, fontsize=10,
    loc='upper right', bbox_to_anchor=(1.3, 1), ncol=1,
    frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
    borderpad=1, borderaxespad=1, labelspacing=0.5,
    scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
)
sb.despine(top=True, right=True, left=False, bottom=False)

# faint horizontal guide line at every y tick
for loc in y_tick_values:
    plt.axhline(loc, ls='--', color='grey', linewidth=0.5, alpha=0.5)

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.48.e Average quarterly bike rentals categorized by bike type.png', dpi=300, bbox_inches='tight')
bikeshare.quarter and pass_type columns:
Columns: quarter, pass_type
Data type: (Categorical, ordered) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
Stack plot:
# Stacked bar chart: rentals per quarter, split by pass type.
# Assign palette as per requirement
sb.set_style('white')
plt.figure(figsize=[6, 5])
flatui = ["#26bda7", "#9b59b6", "#3498db", "#e74c3c", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.6)

# cat1 = categories along the x axis, cat2 = categories stacked within a bar
cat1_order = bikeshare.quarter.sort_values(ascending=True).unique()
cat2_order = bikeshare.pass_type.sort_values(ascending=True).unique()
bar_positions = np.arange(len(cat1_order))

# plot stacked plot: every pass type is drawn on top of the running total
baselines = np.zeros(len(cat1_order))
for cat2 in cat2_order:
    # rentals of this pass type in each quarter
    inner_counts = bikeshare[bikeshare['pass_type'] == cat2]['quarter'].value_counts()
    # reindex() puts the counts in x-axis order and gives NaN (filled with 0)
    # for a quarter that has no rentals of this pass type
    heights = inner_counts.reindex(cat1_order).fillna(0).values
    plt.bar(x=bar_positions, height=heights, bottom=baselines, alpha=0.8)
    baselines += heights

# improve plot aesthetics
# y ticks every 50 K rentals, up to the first tick at or above the busiest quarter
max_count = bikeshare.groupby('quarter').size().max()
y_tick_values = np.arange(0, max_count + 50000, 50000)
y_tick_names = ['{:0.0f} K'.format(v / 1000) for v in y_tick_values]
plt.title('Aggregated quarterly rentals based on pass type\n', weight='bold', fontsize=16)
plt.xticks(bar_positions, cat1_order, fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nQuarter of the year', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.legend(cat2_order, scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
           framealpha=1, borderpad=1, borderaxespad=1, loc='upper right', labelspacing=0.5,
           title='Pass type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.3, 1))
sb.despine()

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.49.a Stack plot of rentals based on pass type and quarter of the year.png', dpi=300, bbox_inches='tight')
Line plot:
The data needs some summarization by grouping together the respective variables. Then, reset the index and name the summarized data values accordingly.
# Number of rentals for every pass type / quarter pair, in long format
categorical_counts = (
    bikeshare
    .groupby(['pass_type', 'quarter'])
    .size()
    .reset_index(name='rentals')
)
categorical_counts.head(10)
# Line plot of rentals per quarter, one line per pass type.
# set the palette as per requirement
sb.set_style('white')
flatui = ["#26bda7", "#9b59b6", "#3498db", "#e74c3c", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.6)
plt.figure(figsize=[6, 4])

# y ticks every 50 K rentals, up to the first tick at or above the largest count
max_count = categorical_counts.rentals.max()
y_tick_values = np.arange(0, max_count + 50000, 50000)
y_tick_names = ['{:0.0f} K'.format(tick / 1000) for tick in y_tick_values]

# plot line plot
ax = sb.lineplot(
    data=categorical_counts,
    x="quarter",
    y="rentals",
    hue="pass_type",
    alpha=0.8,
    err_style="bars",
    linewidth=3,
)
plt.title('Aggregated quarterly rentals based on pass type\n', weight='bold', fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nQuarter of the year', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)

# Custom legend with its own title. The first legend text is then blanked;
# NOTE(review): this presumably hides the hue variable's name that seaborn
# adds as the first legend entry - confirm against the installed seaborn.
legend = ax.legend(
    title="Pass type", title_fontsize=12, fontsize=10,
    loc='upper right', bbox_to_anchor=(1.3, 1), ncol=1,
    frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
    borderpad=1, borderaxespad=1, labelspacing=0.5,
    scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
)
legend.texts[0].set_text("")
# (sb.despine() is intentionally left disabled for this plot)

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.49.b Line plot of rentals based on pass type and quarter of the year.png', dpi=300, bbox_inches='tight')
Individual plots of aggregated distribution of quarterly rentals over pass type:
# Facet grid: one count plot of rentals per quarter for every pass type.
# set the palette as per requirement
flatui = ["#47b1c9", "#4c5e58"]
sb.set_palette(flatui, desat=0.8)
base_color = sb.color_palette()[0]
sb.set_style('white')

# plot facet grid
# FacetGrid creates its own figure, so no separate plt.figure() call is made
# (one would only leave an empty figure behind).
quarter_order = bikeshare.quarter.sort_values(ascending=True).unique()
g = sb.FacetGrid(data=bikeshare, col='pass_type', col_wrap=3, height=4, aspect=1)
g.map(sb.countplot, "quarter", color=base_color, order=quarter_order)
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Aggregated quarterly distribution of bike rentals categorized by pass type\n', fontsize=16, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Read the tick labels from the first facet only: set_yticklabels and
# set_xticklabels apply one list to every facet, so the list must hold exactly
# one label per tick rather than the labels of all facets concatenated.
first_ax = g.axes.flat[0]
# y tick labels are raw counts; rewrite them in thousands ('K')
y_tick_names = ['{:0.0f} K'.format(int(y_label.get_text()) / 1000)
                for y_label in first_ax.get_yticklabels()]
# x tick labels are kept as they are
x_tick_names = [x_label.get_text() for x_label in first_ax.get_xticklabels()]

g.set_yticklabels(y_tick_names, size=10)
g.set_xticklabels(x_tick_names, size=10)
g.set_xlabels('\nQuarter of the year', size=12)
g.set_ylabels('Rentals (thousands)\n', size=12)
plt.subplots_adjust(wspace=0.1, hspace=0.3)

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.49.c Facet grid of rentals based on pass type and quarter of the year.png', dpi=300, bbox_inches='tight')
Let us plot a wider variety of graphs to observe the distribution of the data and uncover hidden insights.
Heat Map:
Now let's pivot the categorical dataset created earlier into a more appropriate data structure.
# Reshape the long-format counts into a pass_type x quarter matrix of rentals,
# the layout that is passed to sb.heatmap in the next cell.
categorical_counts = categorical_counts.pivot(
    values='rentals',
    index='pass_type',
    columns='quarter',
)
categorical_counts
The data is now ready to be plotted as a heat map.
# Heat map of the pass_type x quarter rental counts, annotated per cell
plt.figure(figsize=[6, 4])
res = sb.heatmap(
    categorical_counts,
    cmap="YlGnBu",
    linewidths=0.1,
    annot=True,
    fmt='.0f',
    annot_kws={'size': 12},
)
plt.yticks(rotation=0, fontsize="12", va="center")
plt.xticks(fontsize="12")
plt.title('Quarterly distribution of bike rentals based on pass type\n',
          fontsize=16, weight='bold')
plt.xlabel('\nQuarter of the year', fontsize=14)
plt.ylabel('Pass type\n', fontsize=14)

# Each annotation holds a raw count; rewrite it as that cell's share of all
# rows in bikeshare, as a percentage with one decimal place. Shares that
# round below 0.1 are shown as '< 0.1%'.
for t in res.texts:
    share = np.round(int(t.get_text()) / bikeshare.shape[0], 4)
    p = np.round(share * 100, 1)
    t.set_text('< 0.1%' if p < 0.1 else str(p) + ' %')

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.49.d Heat map of rentals based on pass type and quarter of the year.png', dpi=300, bbox_inches='tight')
Find average rentals based on quarter of the year:
The pass_type column needs to be converted to a categorical datatype; otherwise the groupby method will ignore NaN rental values and in turn produce an incorrect avg. rental value. Also, the size() method ignores the unused level combinations of the groups, hence the count() method is used.
# Count trip_id values for every (year, quarter, pass_type) group and keep
# the result as a long-format frame with a 'rentals' column.
quarter_df = (
    bikeshare
    .groupby(['year', 'quarter', 'pass_type'])['trip_id']
    .count()
    .reset_index(name='rentals')
)
# Replace any missing count with 0 and store the counts as integers.
quarter_df['rentals'] = quarter_df['rentals'].fillna(0).astype(int)
quarter_df.head(10)
# Point plot of rentals per quarter, one line per pass type, drawn from the
# per-(year, quarter, pass_type) counts in quarter_df.
plt.figure(figsize=[7, 5])
sb.set_style('white')
flatui = ["#34e0c7", "#c271e3", "#4cb1f5", "#e06458", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.6)
base_color = sb.color_palette()[0]
sb.pointplot(
    data=quarter_df,
    x="quarter",
    y="rentals",
    hue='pass_type',
    linestyles="-",
    scale=1,
    ci=None,
)
plt.title('Average quarterly bike rentals categorized by pass type\n', weight='bold', fontsize=16)
plt.ylabel('Avg. bike rentals\n', fontsize=14)
plt.xlabel('\nQuarter of the year', fontsize=14)
plt.xticks(fontsize=12)

# current y tick locations and labels (replaced by the fixed ticks below)
locs, labels = plt.yticks()
# fixed y axis: one tick every 10 K rentals from 0 up to 100 K
# NOTE(review): the 'weekday_' prefix looks like a leftover from another
# section; here the value is the ceiling of this quarterly plot's y axis.
weekday_rental_avg_max = 100000
y_tick_values = np.arange(0, weekday_rental_avg_max + 10000, 10000)
y_tick_names = ['{:0.0f} K'.format(tick / 1000) for tick in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

plt.legend(
    title='Pass type', title_fontsize=12, fontsize=10,
    loc='upper right', bbox_to_anchor=(1.3, 1), ncol=1,
    frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
    borderpad=1, borderaxespad=1, labelspacing=0.5,
    scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
)
sb.despine(top=True, right=True, left=False, bottom=False)

# faint horizontal guide line at every y tick
for loc in y_tick_values:
    plt.axhline(loc, ls='--', color='grey', linewidth=0.5, alpha=0.5)

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.49.e Average quarterly bike rentals categorized by pass type.png', dpi=300, bbox_inches='tight')
bikeshare.quarter and fare_type columns:
Columns: quarter, fare_type
Data type: (Categorical, ordered) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
Stack plot:
# Stacked bar chart: rentals per quarter, split by fare type.
# Assign palette as per requirement
sb.set_style('white')
plt.figure(figsize=[6, 5])
flatui = ['#466887', '#eda668']
sb.set_palette(flatui, n_colors=2, desat=0.6)

# cat1 = categories along the x axis, cat2 = categories stacked within a bar
cat1_order = bikeshare.quarter.sort_values(ascending=True).unique()
cat2_order = bikeshare.fare_type.sort_values(ascending=True).unique()
bar_positions = np.arange(len(cat1_order))

# plot stacked plot: every fare type is drawn on top of the running total
baselines = np.zeros(len(cat1_order))
for cat2 in cat2_order:
    # rentals of this fare type in each quarter
    inner_counts = bikeshare[bikeshare['fare_type'] == cat2]['quarter'].value_counts()
    # reindex() puts the counts in x-axis order and gives NaN (filled with 0)
    # for a quarter that has no rentals of this fare type
    heights = inner_counts.reindex(cat1_order).fillna(0).values
    plt.bar(x=bar_positions, height=heights, bottom=baselines, alpha=0.8)
    baselines += heights

# improve plot aesthetics
# y ticks every 50 K rentals, up to the first tick at or above the busiest quarter
max_count = bikeshare.groupby('quarter').size().max()
y_tick_values = np.arange(0, max_count + 50000, 50000)
y_tick_names = ['{:0.0f} K'.format(v / 1000) for v in y_tick_values]
plt.title('Aggregated quarterly rentals based on fare type\n', weight='bold', fontsize=16)
plt.xticks(bar_positions, cat1_order, fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nQuarter of the year', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.legend(cat2_order, scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
           framealpha=1, borderpad=1, borderaxespad=1, loc='upper right', labelspacing=0.5,
           title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.3, 1))
sb.despine()

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.50.a Stack plot of rentals based on fare type and quarter of the year.png', dpi=300, bbox_inches='tight')
Line plot:
The data needs some summarization by grouping together the respective variables. Then, reset the index and name the summarized data values accordingly.
# Number of rentals for every fare type / quarter pair, in long format
categorical_counts = (
    bikeshare
    .groupby(['fare_type', 'quarter'])
    .size()
    .reset_index(name='rentals')
)
categorical_counts.head(10)
# Line plot of rentals per quarter, one line per fare type.
# set the palette as per requirement
sb.set_style('white')
flatui = ['#466887', '#eda668']
sb.set_palette(flatui, n_colors=2, desat=0.6)
plt.figure(figsize=[6, 4])

# y ticks every 50 K rentals, up to the first tick at or above the largest count
max_count = categorical_counts.rentals.max()
y_tick_values = np.arange(0, max_count + 50000, 50000)
y_tick_names = ['{:0.0f} K'.format(v / 1000) for v in y_tick_values]

# plot line plot
ax = sb.lineplot(data=categorical_counts, x="quarter", y="rentals", hue="fare_type",
                 alpha=0.8, err_style="bars", linewidth=4)
# title spelling corrected ("quarterly")
plt.title('Aggregated quarterly rentals based on fare type\n', weight='bold', fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nQuarter of the year', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)

# Custom legend with its own title. The first legend text is then blanked;
# NOTE(review): this presumably hides the hue variable's name that seaborn
# adds as the first legend entry - confirm against the installed seaborn.
legend = ax.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1, framealpha=1,
                   borderpad=1, borderaxespad=1, loc='upper right', labelspacing=0.5,
                   title='Fare type', title_fontsize=12, fontsize=10, facecolor='white', markerfirst=True,
                   handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.35, 1))
legend.texts[0].set_text("")

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.50.b Line plot of rentals based on fare type and quarter of the year.png', dpi=300, bbox_inches='tight')
Individual plots of aggregated distribution of quarterly rentals over fare type:
# Facet grid: one count plot of rentals per quarter for every fare type.
# set the palette as per requirement
flatui = ["#47b1c9", "#4c5e58"]
sb.set_palette(flatui, desat=0.8)
base_color = sb.color_palette()[0]
sb.set_style('white')

# plot facet grid
# FacetGrid creates its own figure, so no separate plt.figure() call is made
# (one would only leave an empty figure behind).
quarter_order = bikeshare.quarter.sort_values(ascending=True).unique()
g = sb.FacetGrid(data=bikeshare, col='fare_type', col_wrap=2, height=4, aspect=1)
g.map(sb.countplot, "quarter", color=base_color, order=quarter_order)
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Aggregated quarterly distribution of bike rentals categorized by fare type\n', fontsize=14, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Read the tick labels from the first facet only: set_yticklabels and
# set_xticklabels apply one list to every facet, so the list must hold exactly
# one label per tick rather than the labels of all facets concatenated.
first_ax = g.axes.flat[0]
# y tick labels are raw counts; rewrite them in thousands ('K')
y_tick_names = ['{:0.0f} K'.format(int(y_label.get_text()) / 1000)
                for y_label in first_ax.get_yticklabels()]
# x tick labels are kept as they are
x_tick_names = [x_label.get_text() for x_label in first_ax.get_xticklabels()]

g.set_yticklabels(y_tick_names, size=10)
g.set_xticklabels(x_tick_names, size=10)
g.set_xlabels('\nQuarter of the year', size=12)
g.set_ylabels('Rentals (thousands)\n', size=12)
plt.subplots_adjust(wspace=0.1, hspace=0.3)

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.50.c Facet grid of rentals based on fare type and quarter of the year.png', dpi=300, bbox_inches='tight')
Let us plot a wider variety of graphs to observe the distribution of the data and uncover hidden insights.
Heat Map:
Now let's pivot the categorical dataset created earlier into a more appropriate data structure.
# Reshape the long-format counts into a fare_type x quarter matrix of rentals,
# the layout that is passed to sb.heatmap in the next cell.
categorical_counts = categorical_counts.pivot(
    values='rentals',
    index='fare_type',
    columns='quarter',
)
categorical_counts
The data is now ready to be plotted as a heat map.
# Heat map of the fare_type x quarter rental counts, annotated per cell
plt.figure(figsize=[6, 2])
res = sb.heatmap(
    categorical_counts,
    cmap="YlGnBu",
    linewidths=0.1,
    annot=True,
    fmt='.0f',
    annot_kws={'size': 12},
)
plt.yticks(rotation=0, fontsize="12", va="center")
plt.xticks(fontsize="12")
plt.title('Quarterly distribution of bike rentals based on fare type\n',
          fontsize=16, weight='bold')
plt.xlabel('\nQuarter of the year', fontsize=14)
plt.ylabel('Fare type\n', fontsize=14)

# Each annotation holds a raw count; rewrite it as that cell's share of all
# rows in bikeshare, as a percentage with one decimal place. Shares that
# round below 0.1 are shown as '< 0.1%'.
for t in res.texts:
    share = np.round(int(t.get_text()) / bikeshare.shape[0], 4)
    p = np.round(share * 100, 1)
    t.set_text('< 0.1%' if p < 0.1 else str(p) + ' %')

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.50.d Heat map of rentals based on fare type and quarter of the year.png', dpi=300, bbox_inches='tight')
Find average rentals based on the quarter of the year:
The fare_type column needs to be converted to a categorical datatype; otherwise the groupby method will ignore NaN rental values and in turn produce an incorrect avg. rental value. Also, the size() method ignores the unused level combinations of the groups, hence the count() method is used.
# Count trip_id values for every (year, quarter, fare_type) group and keep
# the result as a long-format frame with a 'rentals' column.
quarter_df = (
    bikeshare
    .groupby(['year', 'quarter', 'fare_type'])['trip_id']
    .count()
    .reset_index(name='rentals')
)
# Replace any missing count with 0 and store the counts as integers.
quarter_df['rentals'] = quarter_df['rentals'].fillna(0).astype(int)
quarter_df.head(10)
# Point plot of rentals per quarter, one line per fare type. quarter_df holds
# one row per (year, quarter, fare_type); the plot title describes the points
# as the average over those rows.
plt.figure(figsize=[7, 5])
sb.set_style('white')
flatui = ['#577da1', '#eda668']
sb.set_palette(flatui, n_colors=2, desat=0.6)
base_color = sb.color_palette()[0]
sb.pointplot(data=quarter_df, x="quarter", y="rentals", linestyles="-", hue='fare_type',
             scale=1, ci=None)
plt.title('Average quarterly bike rentals categorized by fare type\n', weight='bold', fontsize=16)
plt.ylabel('Avg. bike rentals\n', fontsize=14)
plt.xlabel('\nQuarter of the year', fontsize=14)
plt.xticks(fontsize=12)

# Fixed y axis: one tick every 10 K rentals from 0 up to 100 K
quarterly_rental_avg_max = 100000
y_tick_values = np.arange(0, quarterly_rental_avg_max + 10000, 10000)
y_tick_names = ['{:0.0f} K'.format(v / 1000) for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

# add annotations
# -------------------------------------------------------
# Average only the 'rentals' column. Calling mean() on the whole grouped frame
# would also try to average the remaining 'year' column, which fails when that
# column is not numeric.
avg_rentals = (
    quarter_df
    .groupby(['fare_type', 'quarter'])['rentals']
    .mean()
    .reset_index()
)
avg_rentals_max = avg_rentals.rentals.max()
avg_rentals_base = avg_rentals.query(' fare_type == "Base" ')
avg_rentals_extended = avg_rentals.query(' fare_type == "Extended" ')

# get the current x tick locations and labels (one per quarter)
locs, labels = plt.xticks()

# Vertical offset (in rentals) between a point and its annotation
indent = 5000
# Annotation boxes reuse the first two palette colours: Base, then Extended
palette = sb.color_palette()
annotation_groups = [
    (avg_rentals_base, palette[0]),
    (avg_rentals_extended, palette[1]),
]
for categorical_df, clr in annotation_groups:
    # pair every x tick location with the average rentals of that quarter
    for loc, count in zip(locs, categorical_df.rentals):
        pct_string = '{:0.0f} K'.format(count / 1000)
        plt.text(loc, count + indent, pct_string, ha='center', color='black',
                 fontsize=12, bbox=dict(pad=1.9, alpha=0.2, color='none', fc=clr))
# -------------------------------------------------------

plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
           framealpha=1, borderpad=1, borderaxespad=1, loc='upper right', labelspacing=0.5,
           title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.3, 1))
sb.despine(top=True, right=True, left=False, bottom=False)

# faint horizontal guide line at every y tick
for loc in y_tick_values:
    plt.axhline(loc, ls='--', color='grey', linewidth=0.5, alpha=0.5)

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.50.e Average quarterly bike rentals categorized by fare type.png', dpi=300, bbox_inches='tight')
bikeshare.year and trip_type columns:
Columns: year, trip_type
Data type: (Categorical, ordered) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
Stack plot:
# Stacked bar chart: rentals per year, split by trip type.
sb.set_style('white')
flatui = ["#80324a", "#7b5f8a"]
sb.set_palette(flatui, desat=0.8)
base_color = sb.color_palette()[0]
plt.figure(figsize=[6, 5])

# cat1 = categories along the x axis, cat2 = categories stacked within a bar
cat1_order = bikeshare.year.sort_values(ascending=True).unique()
cat2_order = bikeshare.trip_type.sort_values(ascending=True).unique()
bar_positions = np.arange(len(cat1_order))

# plot stacked plot: every trip type is drawn on top of the running total
baselines = np.zeros(len(cat1_order))
for cat2 in cat2_order:
    # rentals of this trip type in each year
    inner_counts = bikeshare[bikeshare['trip_type'] == cat2]['year'].value_counts()
    # reindex() puts the counts in x-axis order and gives NaN (filled with 0)
    # for a year that has no rentals of this trip type
    heights = inner_counts.reindex(cat1_order).fillna(0).values
    plt.bar(x=bar_positions, height=heights, bottom=baselines, alpha=0.8)
    baselines += heights

# improve plot aesthetics
# y ticks every 100 K rentals, up to the first tick at or above the busiest year
max_count = bikeshare.groupby('year').size().max()
y_tick_values = np.arange(0, max_count + 100000, 100000)
y_tick_names = ['{:0.0f} K'.format(v / 1000) for v in y_tick_values]
plt.title('Aggregated yearly rentals based on trip type\n', weight='bold', fontsize=16)
plt.xticks(bar_positions, cat1_order, fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nYear', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.legend(cat2_order, scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1,
           framealpha=1, borderpad=1, borderaxespad=1, loc='upper right', labelspacing=0.5,
           title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.4, 1))
sb.despine()

# bbox_inches='tight' adjusts the saved figure to include all of the x and y labels
plt.savefig('plots/3.2.51.a Stack plot of Trip rentals based on trip type and the year.png', dpi=300, bbox_inches='tight')
Line Plot:
The data needs some summarization by grouping together the respective variables. Then, reset the index and name the summarized data values accordingly.
# Count the rentals of every (trip type, year) pair and flatten the result
# into a tidy frame whose count column is called 'rentals'.
categorical_counts = (
    bikeshare
    .groupby(['trip_type', 'year'])
    .size()
    .reset_index(name='rentals')
)
categorical_counts.head(10)
# Line plot: rentals per year, one line per trip type.
# set the palette as per requirement
flatui = ["#80324a", "#7b5f8a"]
sb.set_palette(flatui, desat = 0.8)
base_color = sb.color_palette()[0]
sb.set_style('white')
plt.figure(figsize = [6, 4])
max_count = categorical_counts.rentals.max()
y_tick_values = np.arange(0, max_count + 100000, 100000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
# plot line plot
ax = sb.lineplot(data=categorical_counts, x = "year", y = "rentals", hue="trip_type",
                 style="trip_type", err_style="bars", linewidth=3)
plt.title('Aggregated yearly rentals based on trip type\n', weight = 'bold', fontsize = 16)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nYear', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
# Label only whole years: year is drawn on a continuous axis, so the automatic
# ticks may fall on values such as 2017.25, 2017.50, 2017.75, ...
# The tick positions are pinned before the labels are replaced so that labels
# and ticks cannot drift apart on a redraw; the view limits are restored
# because pinning ticks may widen them.
x_limits = ax.get_xlim()
x_tick_values = ax.get_xticks()
xlabels = ['{:.0f}'.format(x) if float(x).is_integer() else "" for x in x_tick_values]
ax.set_xticks(x_tick_values)
ax.set_xticklabels(xlabels, fontsize=12)
ax.set_xlim(x_limits)
# Depending on the seaborn version, the hue variable name may be inserted as a
# pseudo-title entry in the legend. Drop that entry by its label (instead of
# blanking whatever text happens to be first) and use a real legend title.
legend_handles, legend_labels = ax.get_legend_handles_labels()
legend_entries = [(handle, label) for handle, label in zip(legend_handles, legend_labels)
                  if label != 'trip_type']
ax.legend([handle for handle, _ in legend_entries], [label for _, label in legend_entries],
          scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1, framealpha = 1,
          borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5,
          title='Trip type', title_fontsize=12, fontsize=10, facecolor='white', markerfirst=True,
          handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.35, 1))
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.51.b Line plot of Trip rentals based on trip type and the year.png', dpi=300, bbox_inches='tight')
Individual plots of aggregated distribution of yearly rentals over trip type:
# Facet grid: one count plot of yearly rentals per trip type.
# set the palette as per requirement
flatui = ["#80324a", "#7b5f8a"]
sb.set_palette(flatui, desat = 0.8)
base_color = sb.color_palette()[0]
sb.set_style('white')
# NOTE: FacetGrid creates its own figure, so no plt.figure() call is made here
# (it would only leave an empty, unused figure behind).
year_order = bikeshare.year.sort_values(ascending=True).unique()
# plot facet grid
g = sb.FacetGrid(data = bikeshare, col = 'trip_type', col_wrap = 2, height = 4, aspect = 1)
g.map(sb.countplot, "year", color = base_color, order = year_order)
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Aggregated yearly distribution of bike rentals categorized by trip type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
# Relabel the y ticks in thousands, facet by facet, from the tick values
# themselves: parsing the label text fails when the labels are still empty
# (figure not drawn yet), and one label list collected over all facets is
# longer than the tick list of any single facet.
for ax in g.axes.flat:
    y_limits = ax.get_ylim()
    y_tick_values = ax.get_yticks()
    y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
    ax.set_yticks(y_tick_values)
    ax.set_yticklabels(y_tick_names, size = 12)
    # pinning the ticks may widen the view; keep it as drawn
    ax.set_ylim(y_limits)
# countplot draws one bar per entry of year_order, in that order
x_tick_names = [str(year) for year in year_order]
g.set_xticklabels(x_tick_names, size = 12)
g.set_xlabels('\nYear', size = 14)
g.set_ylabels('Rentals (Thousands)\n', size = 14)
g.fig.subplots_adjust(wspace=0.1, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
g.fig.savefig('plots/3.2.51.c Facet grid of Trip rentals based on trip type and the year.png', dpi=300, bbox_inches='tight')
Let us plot a wider variety of graphs to observe the distribution of the data and uncover hidden insights.
# Three side-by-side views (violin, box, strip) of year against trip type.
plt.figure(figsize = [14, 4])
sb.set_style('darkgrid')
# set the palette as per requirement
flatui = ["#80324a", "#7b5f8a"]
sb.set_palette(flatui, desat = 0.8)
base_color = sb.color_palette()[0]


def _decorate_panel(panel_ax, title, y_label):
    """Title and label one panel, printing y tick labels only at whole years.

    Year is drawn on a continuous axis, so the automatic ticks may fall on
    values such as 2017.25; those ticks keep an empty label. Tick positions
    are pinned before the labels are replaced so both stay aligned, and the
    view limits are restored because pinning ticks may widen them.
    """
    panel_ax.set_title(title, weight = 'bold', fontsize = 16)
    panel_ax.set_xlabel('\nTrip type', fontsize = 14)
    panel_ax.set_ylabel(y_label, fontsize = 14)
    panel_ax.tick_params(axis = 'x', labelsize = 12)
    y_limits = panel_ax.get_ylim()
    y_tick_values = panel_ax.get_yticks()
    whole_year_labels = ['{:.0f}'.format(y) if float(y).is_integer() else "" for y in y_tick_values]
    panel_ax.set_yticks(y_tick_values)
    panel_ax.set_yticklabels(whole_year_labels, fontsize = 12)
    panel_ax.set_ylim(y_limits)


# left plot: violin plot
plt.subplot(1, 3, 1)
ax1 = sb.violinplot(data = bikeshare, x = 'trip_type', y = 'year', inner = 'quartile',
                    color = base_color)
_decorate_panel(ax1, 'Bike rentals - Violin plot\n', 'Year\n')
# center plot: box plot
plt.subplot(1, 3, 2)
ax2 = sb.boxplot(data = bikeshare, x = 'trip_type', y = 'year', color = base_color)
_decorate_panel(ax2, 'Bike rentals - Box plot\n', '')
# right plot: strip chart (very low alpha because every trip is one point)
plt.subplot(1, 3, 3)
ax3 = sb.stripplot(data = bikeshare, x = "trip_type", y = "year", alpha = 0.002, color = base_color)
_decorate_panel(ax3, 'Bike rentals - Strip plot\n', '')
plt.subplots_adjust(wspace=0.3, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.51.d Distribution of Trip rentals based on trip type and the year.png', dpi=300, bbox_inches='tight')
Heat Map:
One alternative way of depicting the relationship between two categorical variables is through a Heat map. Heat maps were introduced earlier as the 2-d version of a histogram; here, we're using them as the 2-d version of a bar chart. The seaborn function heatmap is at home with this type of heat map implementation, but the input arguments are unlike most of the visualization functions. Instead of providing the original dataframe, we need to summarize the counts into a matrix that will then be plotted.
Now let's pivot the categorical dataset created earlier into a more appropriate data structure.
# Reshape the long (trip_type, year, rentals) table into a matrix with one
# row per trip type and one column per year, as sb.heatmap expects.
# NOTE: this overwrites categorical_counts, so the cell cannot be re-run
# without first re-running the cell that builds the long table.
pivot_layout = {'index': 'trip_type', 'columns': 'year', 'values': 'rentals'}
categorical_counts = categorical_counts.pivot(**pivot_layout)
categorical_counts
The data is ready to be plotted as the Heat map.
# Heat map of rentals per (trip type, year); each cell is annotated with its
# share of all rentals in the dataset.
plt.figure(figsize = [6, 2])
res = sb.heatmap(categorical_counts, annot = True, fmt = '.0f', annot_kws={'size':12},
                 linewidths=0.1, cmap="YlGnBu")
plt.yticks(rotation=0, fontsize="12", va="center")
plt.xticks(fontsize="12")
plt.title('Aggregated yearly distribution of rentals based on trip type\n', fontsize = 16, weight = 'bold')
plt.xlabel('\nYear', fontsize=14)
plt.ylabel('Trip type\n', fontsize=14);
# the annotations start out as raw counts; rewrite each as a percentage of
# the total number of rows in bikeshare
total_rentals = bikeshare.shape[0]
for cell_text in res.texts:
    cell_share = np.round(int(cell_text.get_text()) / total_rentals, 4)
    cell_percent = np.round(cell_share * 100, 1)
    cell_text.set_text(str(cell_percent) + ' %')
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.51.e Heat map of Trip rentals based on trip type and the year.png', dpi=300, bbox_inches='tight')
Find average rentals based on the year:
The trip_type column needs to be converted into a categorical datatype; otherwise the groupby method will ignore NaN rental values and in turn produce an incorrect avg. rental value. Also, the size() method ignores the unused level combinations of the groups, hence use the count() method.
# Rentals per (year, trip type): count the trip ids inside every group.
year_df = (
    bikeshare
    .groupby(['year', 'trip_type'])['trip_id']
    .count()
    .reset_index(name='rentals')
)
# missing counts become 0 and the column is stored as integers
year_df['rentals'] = year_df['rentals'].fillna(0).astype(int)
year_df.head(10)
# Point plot: rentals per year, one line per trip type, with a value label
# above every point.
plt.figure(figsize=[6, 5])
sb.set_style('white')
flatui = ["#fff480"]
sb.set_palette(flatui, n_colors=1, desat=0.8)
base_color = sb.color_palette()[0]
sb.pointplot(data = year_df, x = "year", y = "rentals", linestyles = ['-', '-'], hue = 'trip_type', ci = None)
plt.title('Average yearly bike rentals categorized by trip type\n', weight = 'bold', fontsize = 16)
plt.ylabel('Avg. bike rentals\n', fontsize = 14)
plt.xlabel('\nYear', fontsize = 14)
plt.xticks(fontsize = 12)
# fixed y axis: 0 to 300 K in steps of 50 K
yearly_rental_avg_max = 300000
y_tick_values = np.arange(0, yearly_rental_avg_max+50000, 50000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize = 12)
# add annotations
# -------------------------------------------------------
avg_rentals = year_df.groupby(['trip_type', 'year'])['rentals'].mean().reset_index()
avg_rentals_max = avg_rentals.rentals.max()
# NOTE: the *_base / *_extended names are kept from the fare-type cells; here
# they hold the "One Way" and "Round Trip" rows respectively.
avg_rentals_base = avg_rentals.query(' trip_type == "One Way" ')
avg_rentals_extended = avg_rentals.query(' trip_type == "Round Trip" ')
# get the current tick locations and labels
locs, labels = plt.xticks()
# vertical gap between a point and its label
indent = 20000
for categorical_df in [avg_rentals_base, avg_rentals_extended]:
    clrs = ['#ff5e8c' if fare == "One Way" else '#5effe4' for fare in categorical_df.trip_type ]
    # pair every x tick with the rentals of the matching row; this relies on
    # the rows being ordered by year, which the groupby above produces
    for loc, count, clr in zip(locs, categorical_df.rentals, clrs):
        pct_string = '{:0.0f} K'.format(count/1000)
        # print the annotation just above the point
        plt.text(loc, count + indent, pct_string, ha = 'center', color = 'black',
                 fontsize = 12, bbox=dict(pad=1.9,alpha=0.2,color='none',fc=clr))
# -------------------------------------------------------
plt.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
           framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5,
           title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.35, 1))
sb.despine(top=True, right=True, left=False, bottom=False)
# faint horizontal guide line at every y tick
for loc in y_tick_values:
    plt.axhline(loc, ls='--', color='grey', linewidth=0.5, alpha=0.4)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.51.f Average yearly bike rentals categorized by trip type.png', dpi=300, bbox_inches='tight')
bikeshare.year and bike_type columns:¶
Columns: year, bike_type
Data type: (Categorical, ordered) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
Stack plot:
# Stacked bar chart: rentals per year, one bar segment per bike type.
# Assign palette as per requirement
sb.set_style('white')
flatui = ["darkslateblue", "lightseagreen", "royalblue", "rebeccapurple"]
sb.set_palette(flatui, desat = 0.8)
plt.figure(figsize = [6, 5])
cat1_order = bikeshare.year.sort_values(ascending=True).unique()
cat2_order = bikeshare.bike_type.sort_values(ascending=True).unique()
bar_positions = np.arange(len(cat1_order))
# running height of the segments already stacked for each year
baselines = np.zeros(len(cat1_order))
for cat2 in cat2_order:
    # yearly rentals for this bike type
    inner_counts = bikeshare[bikeshare['bike_type'] == cat2]['year'].value_counts()
    # reindex (instead of label indexing) so that a year without any rental of
    # this type becomes 0 rather than raising KeyError before fillna can run
    segment_heights = np.asarray(inner_counts.reindex(cat1_order, fill_value=0), dtype=float)
    # draw this segment on top of the accumulated baseline
    plt.bar(x = bar_positions, height = segment_heights, bottom = baselines, alpha = 0.8)
    baselines += segment_heights
# improve plot aesthetics
# the tallest stacked bar equals the largest yearly total
max_count = bikeshare.groupby('year').size().max()
y_tick_values = np.arange(0, max_count + 100000, 100000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
plt.title('Aggregated yearly rentals based on bike type\n', weight = 'bold', fontsize = 16)
plt.xticks(bar_positions, cat1_order, fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nYear', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.legend(cat2_order, scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
           framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5,
           title='Bike type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.3, 1))
sb.despine()
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.52.a Stack plot of rentals based on bike type and the year.png', dpi=300, bbox_inches='tight')
Line plot:
The data needs some summarization by grouping together the respective variables. Then, reset the index and name the summarized data values accordingly.
# Count the rentals of every (bike type, year) pair and flatten the result
# into a tidy frame whose count column is called 'rentals'.
categorical_counts = (
    bikeshare
    .groupby(['bike_type', 'year'])
    .size()
    .reset_index(name='rentals')
)
categorical_counts.head(10)
# Line plot: rentals per year, one solid line with round markers per bike type.
# set the palette as per requirement
sb.set_style('white')
flatui = ['#ff91e2', '#91ffda', '#60acfc', '#bd91ff']
sb.set_palette(flatui, n_colors=4, desat=0.6)
plt.figure(figsize = [6, 4])
max_count = categorical_counts.rentals.max()
y_tick_values = np.arange(0, max_count + 100000, 100000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
# plot line plot; dashes=False keeps every line (and its legend handle) solid,
# which replaces the index-based set_linestyle("-") calls that depended on the
# exact number and order of artists
ax = sb.lineplot(data = categorical_counts, x = "year", y = "rentals", hue="bike_type", linewidth=4,
                 style="bike_type", err_style="bars", markers = ['o', 'o', 'o', 'o'], markersize=10,
                 dashes=False)
plt.title('Aggregated yearly rentals based on bike type\n', weight = 'bold', fontsize = 16)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nYear', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
# Label only whole years: year is drawn on a continuous axis, so the automatic
# ticks may fall on values such as 2017.25, 2017.50, 2017.75, ...
# The tick positions are pinned before the labels are replaced so that labels
# and ticks cannot drift apart on a redraw; the view limits are restored
# because pinning ticks may widen them.
x_limits = ax.get_xlim()
x_tick_values = ax.get_xticks()
xlabels = ['{:.0f}'.format(x) if float(x).is_integer() else "" for x in x_tick_values]
ax.set_xticks(x_tick_values)
ax.set_xticklabels(xlabels, fontsize=12)
ax.set_xlim(x_limits)
# customize legend
# Depending on the seaborn version, the hue variable name may be inserted as a
# pseudo-title entry in the legend. Drop that entry by its label (instead of
# overwriting whatever text happens to be first) and use a real legend title.
legend_handles, legend_labels = ax.get_legend_handles_labels()
legend_entries = [(handle, label) for handle, label in zip(legend_handles, legend_labels)
                  if label != 'bike_type']
leg = ax.legend([handle for handle, _ in legend_entries], [label for _, label in legend_entries],
                scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1, framealpha = 1,
                borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5,
                title='Bike type', title_fontsize=12, fontsize=10, facecolor='white', markerfirst=True,
                handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.3, 1))
sb.despine()
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.52.b Line plot of rentals based on bike type and the year.png', dpi=300, bbox_inches='tight')
Individual plots of aggregated distribution of yearly rentals over bike type:
# Facet grid: one count plot of yearly rentals per bike type.
# set the palette as per requirement
flatui = ["#80324a", "#7b5f8a"]
sb.set_palette(flatui, desat = 0.8)
base_color = sb.color_palette()[0]
sb.set_style('white')
# NOTE: FacetGrid creates its own figure, so no plt.figure() call is made here
# (it would only leave an empty, unused figure behind).
year_order = bikeshare.year.sort_values(ascending=True).unique()
# plot facet grid
g = sb.FacetGrid(data = bikeshare, col = 'bike_type', col_wrap = 2, height = 3, aspect = 1)
g.map(sb.countplot, "year", color = base_color, order = year_order)
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Aggregated yearly distribution of bike rentals categorized by bike type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
# Relabel the y ticks in thousands, facet by facet, from the tick values
# themselves: parsing the label text fails when the labels are still empty
# (figure not drawn yet), and one label list collected over all facets is
# longer than the tick list of any single facet.
for ax in g.axes.flat:
    y_limits = ax.get_ylim()
    y_tick_values = ax.get_yticks()
    y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
    ax.set_yticks(y_tick_values)
    ax.set_yticklabels(y_tick_names, size = 12)
    # pinning the ticks may widen the view; keep it as drawn
    ax.set_ylim(y_limits)
# countplot draws one bar per entry of year_order, in that order
x_tick_names = [str(year) for year in year_order]
g.set_xticklabels(x_tick_names, size = 12)
g.set_xlabels('\nYear', size = 14)
g.set_ylabels('Rentals (Thousands)\n', size = 14)
g.fig.subplots_adjust(wspace=0.1, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
g.fig.savefig('plots/3.2.52.c Facet grid of rentals based on bike type and the year.png', dpi=300, bbox_inches='tight')
Let us plot a wider variety of graphs to observe the distribution of the data and uncover hidden insights.
Heat Map:
Now let's pivot the categorical dataset created earlier into a more appropriate data structure.
# Reshape the long (bike_type, year, rentals) table into a matrix with one
# row per bike type and one column per year, as sb.heatmap expects.
# NOTE: this overwrites categorical_counts, so the cell cannot be re-run
# without first re-running the cell that builds the long table.
pivot_layout = {'index': 'bike_type', 'columns': 'year', 'values': 'rentals'}
categorical_counts = categorical_counts.pivot(**pivot_layout)
categorical_counts
The data is ready to be plotted as the Heat map.
# Heat map of rentals per (bike type, year); each cell is annotated with its
# share of all rentals in the dataset.
plt.figure(figsize = [6, 4])
res = sb.heatmap(categorical_counts, annot = True, fmt = '.0f', annot_kws={'size':12},
                 linewidths=0.1, cmap="YlGnBu")
plt.yticks(rotation=0, fontsize="12", va="center")
plt.xticks(fontsize="12")
plt.title('Yearly distribution of bike rentals based on bike type\n', fontsize = 16, weight = 'bold')
plt.xlabel('\nYear', fontsize=14)
plt.ylabel('Bike type\n', fontsize=14);
# the annotations start out as raw counts; rewrite each as a percentage of
# the total number of rows in bikeshare
total_rentals = bikeshare.shape[0]
for cell_text in res.texts:
    cell_share = np.round(int(cell_text.get_text()) / total_rentals, 4)
    cell_percent = np.round(cell_share * 100, 1)
    cell_text.set_text(str(cell_percent) + ' %')
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.52.d Heat map of rentals based on bike type and the year.png', dpi=300, bbox_inches='tight')
Find average rentals based on the year:
The bike_type column needs to be converted into a categorical datatype; otherwise the groupby method will ignore NaN rental values and in turn produce an incorrect avg. rental value. Also, the size() method ignores the unused level combinations of the groups, hence use the count() method.
# Rentals per (year, bike type): count the trip ids inside every group.
year_df = (
    bikeshare
    .groupby(['year', 'bike_type'])['trip_id']
    .count()
    .reset_index(name='rentals')
)
year_df.head(10)
# Point plot: rentals per year, one line per bike type.
plt.figure(figsize=[6,5])
sb.set_style('white')
flatui = ['#ff7ddd', '#77f7cc', '#4b99eb', '#aa75fa']
sb.set_palette(flatui, n_colors=4, desat=0.6)
base_color = sb.color_palette()[0]
sb.pointplot(data = year_df, x = "year", y = "rentals", hue = 'bike_type',
             linestyles = "-", scale = 1, ci = None)
plt.title('Average yearly bike rentals categorized by bike type\n', weight = 'bold', fontsize = 16)
plt.xlabel('\nYear', fontsize = 14)
plt.ylabel('Avg. bike rentals\n', fontsize = 14)
plt.xticks(fontsize = 12)
# current y tick locations and labels (read only; the axis is re-ticked below)
locs, labels = plt.yticks()
# fixed y axis: 0 to 300 K in steps of 50 K
yearly_rental_avg_max = 300000
y_tick_values = np.arange(0, yearly_rental_avg_max+50000, 50000)
y_tick_names = []
for v in y_tick_values:
    y_tick_names.append('{:0.0f} K'.format(v/1000))
plt.yticks(y_tick_values, y_tick_names, fontsize = 12)
legend_options = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
                      framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper right',
                      labelspacing=0.5, title='Bike type', title_fontsize=12, fontsize=10,
                      facecolor='white', markerfirst=True, handlelength=2, handletextpad=0.5,
                      bbox_to_anchor=(1.3, 1))
plt.legend(**legend_options)
# drop the top and right spines, keep the left and bottom ones
sb.despine(top=True, right=True, left=False, bottom=False)
# faint horizontal guide line at every y tick
for loc in y_tick_values:
    plt.axhline(loc, ls='--', color='grey', linewidth=0.5, alpha=0.5)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.52.e Average yearly bike rentals categorized by bike type.png', dpi=300, bbox_inches='tight')
bikeshare.year and pass_type columns:¶
Columns: year, pass_type
Data type: (Categorical, ordered) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
Stack plot:
# Stacked bar chart: rentals per year, one bar segment per pass type.
# Assign palette as per requirement
sb.set_style('white')
plt.figure(figsize = [6, 5])
flatui = ["#26bda7", "#9b59b6", "#3498db", "#e74c3c", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.6)
cat1_order = bikeshare.year.sort_values(ascending=True).unique()
cat2_order = bikeshare.pass_type.sort_values(ascending=True).unique()
bar_positions = np.arange(len(cat1_order))
# running height of the segments already stacked for each year
baselines = np.zeros(len(cat1_order))
for cat2 in cat2_order:
    # yearly rentals for this pass type
    inner_counts = bikeshare[bikeshare['pass_type'] == cat2]['year'].value_counts()
    # reindex (instead of label indexing) so that a year without any rental of
    # this type becomes 0 rather than raising KeyError before fillna can run
    segment_heights = np.asarray(inner_counts.reindex(cat1_order, fill_value=0), dtype=float)
    # draw this segment on top of the accumulated baseline
    plt.bar(x = bar_positions, height = segment_heights, bottom = baselines, alpha = 0.8)
    baselines += segment_heights
# improve plot aesthetics
# the tallest stacked bar equals the largest yearly total
max_count = bikeshare.groupby('year').size().max()
y_tick_values = np.arange(0, max_count + 50000, 50000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
plt.title('Aggregated yearly rentals based on pass type\n', weight = 'bold', fontsize = 16)
plt.xticks(bar_positions, cat1_order, fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nYear', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.legend(cat2_order, scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
           framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper left', labelspacing=0.5,
           title='Pass type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1, 1))
sb.despine()
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.53.a Stack plot of rentals based on pass type and the year.png', dpi=300, bbox_inches='tight')
Line plot:
The data needs some summarization by grouping together the respective variables. Then, reset the index and name the summarized data values accordingly.
# Count the rentals of every (pass type, year) pair and flatten the result
# into a tidy frame whose count column is called 'rentals'.
categorical_counts = (
    bikeshare
    .groupby(['pass_type', 'year'])
    .size()
    .reset_index(name='rentals')
)
categorical_counts.head(10)
# Line plot: rentals per year, one solid line with round markers per pass type.
# set the palette as per requirement
sb.set_style('whitegrid')
flatui = ["#26bda7", "#9b59b6", "#3498db", "#e74c3c", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.6)
plt.figure(figsize = [6, 4])
max_count = categorical_counts.rentals.max()
y_tick_values = np.arange(0, max_count + 50000, 50000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
# plot line plot; dashes=False keeps every line (and its legend handle) solid.
# The earlier index-based set_linestyle("-") calls covered five plot lines but
# only four legend lines, so the last legend entry could stay dashed.
ax = sb.lineplot(data=categorical_counts, x = "year", y = "rentals", hue="pass_type", linewidth=3, alpha = 0.8,
                 style="pass_type", err_style="bars", markers = ['o', 'o', 'o', 'o', 'o'], markersize=10,
                 dashes=False)
plt.title('Aggregated yearly rentals based on pass type\n', weight = 'bold', fontsize = 16)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nYear', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
# Label only whole years: year is drawn on a continuous axis, so the automatic
# ticks may fall on values such as 2017.25, 2017.50, 2017.75, ...
# The tick positions are pinned before the labels are replaced so that labels
# and ticks cannot drift apart on a redraw; the view limits are restored
# because pinning ticks may widen them.
x_limits = ax.get_xlim()
x_tick_values = ax.get_xticks()
xlabels = ['{:.0f}'.format(x) if float(x).is_integer() else "" for x in x_tick_values]
ax.set_xticks(x_tick_values)
ax.set_xticklabels(xlabels, fontsize=12)
ax.set_xlim(x_limits)
# customize legend
# Depending on the seaborn version, the hue variable name may be inserted as a
# pseudo-title entry in the legend. Drop that entry by its label (instead of
# overwriting whatever text happens to be first) and use a real legend title.
legend_handles, legend_labels = ax.get_legend_handles_labels()
legend_entries = [(handle, label) for handle, label in zip(legend_handles, legend_labels)
                  if label != 'pass_type']
leg = ax.legend([handle for handle, _ in legend_entries], [label for _, label in legend_entries],
                scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1, framealpha = 1,
                borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5,
                title='Pass type', title_fontsize=12, fontsize=10, facecolor='white', markerfirst=True,
                handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.35, 1))
# spines are left in place here (no despine) to go with the 'whitegrid' style
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.53.b Line plot of rentals based on pass type and the year.png', dpi=300, bbox_inches='tight')
Individual plots of aggregated distribution of yearly rentals over pass type:
# Facet grid: one count plot of yearly rentals per pass type.
# set the palette as per requirement
flatui = ["#80324a", "#7b5f8a"]
sb.set_palette(flatui, desat = 0.8)
base_color = sb.color_palette()[0]
sb.set_style('white')
# NOTE: FacetGrid creates its own figure, so no plt.figure() call is made here
# (it would only leave an empty, unused figure behind).
year_order = bikeshare.year.sort_values(ascending=True).unique()
# plot facet grid
g = sb.FacetGrid(data = bikeshare, col = 'pass_type', col_wrap = 3, height = 3.5, aspect = 1)
g.map(sb.countplot, "year", color = base_color, order = year_order)
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Aggregated yearly distribution of bike rentals categorized by pass type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
# Relabel the y ticks in thousands, facet by facet, from the tick values
# themselves: parsing the label text fails when the labels are still empty
# (figure not drawn yet), and one label list collected over all facets is
# longer than the tick list of any single facet.
for ax in g.axes.flat:
    y_limits = ax.get_ylim()
    y_tick_values = ax.get_yticks()
    y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
    ax.set_yticks(y_tick_values)
    ax.set_yticklabels(y_tick_names, size = 12)
    # pinning the ticks may widen the view; keep it as drawn
    ax.set_ylim(y_limits)
# countplot draws one bar per entry of year_order, in that order
x_tick_names = [str(year) for year in year_order]
g.set_xticklabels(x_tick_names, size = 12)
g.set_xlabels('\nYear', size = 14)
g.set_ylabels('Rentals (Thousands)\n', size = 14)
g.fig.subplots_adjust(wspace=0.1, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
g.fig.savefig('plots/3.2.53.c Facet grid of rentals based on pass type and the year.png', dpi=300, bbox_inches='tight')
Let us plot a wider variety of graphs to observe the distribution of the data and uncover hidden insights.
Heat Map:
Now let's pivot the categorical dataset created earlier into a more appropriate data structure.
# Reshape the long (pass_type, year, rentals) table into a matrix with one
# row per pass type and one column per year, as sb.heatmap expects.
# NOTE: this overwrites categorical_counts, so the cell cannot be re-run
# without first re-running the cell that builds the long table.
pivot_layout = {'index': 'pass_type', 'columns': 'year', 'values': 'rentals'}
categorical_counts = categorical_counts.pivot(**pivot_layout)
categorical_counts
The data is ready to be plotted as the Heat map.
# Heat map of rentals per (pass type, year); each cell is annotated with its
# share of all rentals in the dataset.
plt.figure(figsize = [6, 4])
res = sb.heatmap(categorical_counts, annot = True, fmt = '.0f', annot_kws={'size':12},
                 linewidths=0.1, cmap="YlGnBu")
plt.yticks(rotation=0, fontsize="12", va="center")
plt.xticks(fontsize="12")
plt.title('Yearly distribution of bike rentals based on pass type\n', fontsize = 16, weight = 'bold')
plt.xlabel('\nYear', fontsize=14)
plt.ylabel('Pass type\n', fontsize=14);
# the annotations start out as raw counts; rewrite each as a percentage of
# the total number of rows in bikeshare
total_rentals = bikeshare.shape[0]
for cell_text in res.texts:
    cell_share = np.round(int(cell_text.get_text()) / total_rentals, 4)
    cell_percent = np.round(cell_share * 100, 1)
    # shares that round below 0.1 are shown as '< 0.1%' instead of '0.0 %'
    percent_label = '< 0.1%' if cell_percent < 0.1 else str(cell_percent) + ' %'
    cell_text.set_text(percent_label)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.53.d Heat map of rentals based on pass type and the year.png', dpi=300, bbox_inches='tight')
Find average rentals based on the year:
The pass_type column needs to be converted into a categorical datatype; otherwise the groupby method will ignore NaN rental values and in turn produce an incorrect avg. rental value. Also, the size() method ignores the unused level combinations of the groups, hence use the count() method.
# Rentals per (year, pass type): count the trip ids inside every group.
year_df = (
    bikeshare
    .groupby(['year', 'pass_type'])['trip_id']
    .count()
    .reset_index(name='rentals')
)
year_df
# Point plot: rentals per year, one line per pass type.
plt.figure(figsize=[6,5])
sb.set_style('white')
flatui = ["#34e0c7", "#c271e3", "#4cb1f5", "#e06458", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.6)
base_color = sb.color_palette()[0]
sb.pointplot(data = year_df, x = "year", y = "rentals", hue = 'pass_type',
             linestyles = "-", scale = 1, ci = None)
plt.title('Average yearly bike rentals categorized by pass type\n', weight = 'bold', fontsize = 16)
plt.xlabel('\nYear', fontsize = 14)
plt.ylabel('Avg. bike rentals\n', fontsize = 14)
plt.xticks(fontsize = 12)
# current y tick locations and labels (read only; the axis is re-ticked below)
locs, labels = plt.yticks()
# fixed y axis: 0 to 300 K in steps of 50 K
yearly_rental_avg_max = 300000
y_tick_values = np.arange(0, yearly_rental_avg_max+50000, 50000)
y_tick_names = []
for v in y_tick_values:
    y_tick_names.append('{:0.0f} K'.format(v/1000))
plt.yticks(y_tick_values, y_tick_names, fontsize = 12)
legend_options = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
                      framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper left',
                      labelspacing=0.5, title='Pass type', title_fontsize=12, fontsize=10,
                      facecolor='white', markerfirst=True, handlelength=2, handletextpad=0.5,
                      bbox_to_anchor=(1, 1))
plt.legend(**legend_options)
# drop the top and right spines, keep the left and bottom ones
sb.despine(top=True, right=True, left=False, bottom=False)
# faint horizontal guide line at every y tick
for loc in y_tick_values:
    plt.axhline(loc, ls='--', color='grey', linewidth=0.5, alpha=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.53.e Average yearly bike rentals categorized by pass type.png', dpi=300, bbox_inches='tight')
bikeshare.year and fare_type columns:¶
Columns: year, fare_type
Data type: (Categorical, ordered) and (Categorical, nominal)
Plot: Facet Grid, Violin plot, Box plot, Adapted Barchart
Stack plot:
# Stacked bar chart: rentals per year, one bar segment per fare type.
# Assign palette as per requirement
sb.set_style('white')
plt.figure(figsize = [6, 5])
flatui = ['#466887', '#eda668']
sb.set_palette(flatui, n_colors=2, desat=0.6)
cat1_order = bikeshare.year.sort_values(ascending=True).unique()
cat2_order = bikeshare.fare_type.sort_values(ascending=True).unique()
bar_positions = np.arange(len(cat1_order))
# running height of the segments already stacked for each year
baselines = np.zeros(len(cat1_order))
for cat2 in cat2_order:
    # yearly rentals for this fare type
    inner_counts = bikeshare[bikeshare['fare_type'] == cat2]['year'].value_counts()
    # reindex (instead of label indexing) so that a year without any rental of
    # this type becomes 0 rather than raising KeyError before fillna can run
    segment_heights = np.asarray(inner_counts.reindex(cat1_order, fill_value=0), dtype=float)
    # draw this segment on top of the accumulated baseline
    plt.bar(x = bar_positions, height = segment_heights, bottom = baselines, alpha = 0.8)
    baselines += segment_heights
# improve plot aesthetics
# the tallest stacked bar equals the largest yearly total
max_count = bikeshare.groupby('year').size().max()
y_tick_values = np.arange(0, max_count + 50000, 50000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
plt.title('Aggregated yearly rentals based on fare type\n', weight = 'bold', fontsize = 16)
plt.xticks(bar_positions, cat1_order, fontsize=12)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nYear', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
plt.legend(cat2_order, scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
           framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5,
           title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
           markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.3, 1))
sb.despine()
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.54.a Stack plot of rentals based on fare type and the year.png', dpi=300, bbox_inches='tight')
Line plot:
The data needs some summarization: group it by the respective variables, then reset the index and name the summarized values accordingly.
# Count the rentals of every (fare type, year) pair; the group keys come back
# as ordinary columns and the counts are stored in a 'rentals' column.
categorical_counts = (
    bikeshare
    .groupby(['fare_type', 'year'])
    .size()
    .reset_index(name='rentals')
)
# notebook display of the first rows
categorical_counts.head(10)
# Line plot of yearly rentals, one line per fare type.
sb.set_style('white')
flatui = ['#466887', '#eda668']
sb.set_palette(flatui, n_colors=2, desat=0.6)
plt.figure(figsize = [6, 4])
max_count = categorical_counts.rentals.max()
y_tick_values = np.arange(0, max_count + 50000, 50000)
y_tick_names = ['{:0.0f} K'.format(v/1000) for v in y_tick_values]
# plot line plot
ax = sb.lineplot(data=categorical_counts, x = "year", y = "rentals", hue="fare_type", linewidth=3, alpha = 0.8)
plt.title('Aggregated yearly rentals based on fare type\n', weight = 'bold', fontsize = 16)
plt.yticks(y_tick_values, y_tick_names, fontsize=12)
plt.xlabel('\nYear', fontsize=14)
plt.ylabel('Rentals (Thousands)\n', fontsize=14)
# Years are drawn on a continuous axis, so the automatic ticks include
# fractional values such as 2017.25; label only the whole-year ticks.
x_ticks = ax.get_xticks()
x_limits = ax.get_xlim()
# pin the tick locations before replacing their labels so the labels cannot
# drift away from their ticks; restore the limits because set_xticks may
# widen the view to include every tick
ax.set_xticks(x_ticks)
ax.set_xlim(x_limits)
xlabels = ['{:.0f}'.format(x) if x % 1 == 0 else "" for x in x_ticks]
ax.set_xticklabels(xlabels)
plt.xticks(fontsize=12)
# when 'hue' is used, its column name appears as the first legend entry;
# replace it with a custom heading.
# NOTE(review): this relies on the seaborn version in use placing the hue name
# in legend.texts[0] - confirm before upgrading seaborn.
fare_legend = ax.legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1, framealpha = 1,
                        borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5,
                        title='', title_fontsize=12, fontsize=10, facecolor='white', markerfirst=True,
                        handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.35, 1))
fare_legend.texts[0].set_text("Fare type")
sb.despine();
# bbox_inches='tight' grows the saved area so the outside legend and all labels are included
plt.savefig('plots/3.2.54.b Line plot of rentals based on fare type and the year.png', dpi=300, bbox_inches='tight')
Individual plots of the aggregated yearly distribution of rentals for each fare type:
# Facet grid of yearly rental counts, one count plot per fare type.
flatui = ["#80324a", "#7b5f8a"]
sb.set_palette(flatui, desat = 0.8)
base_color = sb.color_palette()[0]
sb.set_style('white')
# FacetGrid creates its own figure, so no separate plt.figure() call is made
# (it would only leave an empty figure behind).
g = sb.FacetGrid(data = bikeshare, col = 'fare_type', col_wrap = 2, height = 3.5, aspect = 1)
g.map(sb.countplot, "year", color = base_color, order = bikeshare.year.sort_values(ascending=True).unique())
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Aggregated yearly distribution of bike rentals categorized by fare type\n', fontsize = 16, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
# Relabel the y ticks of every facet in thousands. Labels are built per axes
# from the tick locations, so their number always matches the number of ticks.
for ax in g.axes.flat:
    y_ticks = ax.get_yticks()
    y_limits = ax.get_ylim()
    # pin the locations first, then restore the limits because set_yticks may
    # widen the view to include every tick
    ax.set_yticks(y_ticks)
    ax.set_ylim(y_limits)
    ax.set_yticklabels(['{:0.0f} K'.format(v/1000) for v in y_ticks], size = 12)
    # the year labels keep their text; only the font size changes
    ax.tick_params(axis='x', labelsize=12)
g.set_xlabels('\nYear', size = 14)
g.set_ylabels('Rentals (thousands)\n', size = 14)
plt.subplots_adjust(wspace=0.2, hspace=0.3);
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.54.c Facet grid of rentals based on fare type and the year.png', dpi=300, bbox_inches='tight')
Let us plot more variety of graphs to observe the distribution of data and hidden insights.
Heat Map:
Now let's pivot the categorical dataset created earlier into a data structure better suited to a heat map.
# Reshape the long-format counts into a fare_type x year table of rentals.
# The long-format frame is overwritten by the pivoted one.
categorical_counts = categorical_counts.pivot(
    values = 'rentals',
    index = 'fare_type',
    columns = 'year',
)
# notebook display of the pivoted table
categorical_counts
The data is ready to be plotted as a
Heat map.
# Heat map of rentals per fare type and year, annotated with each cell's
# share of all rentals.
plt.figure(figsize = [6, 2])
res = sb.heatmap(categorical_counts, cmap="YlGnBu", linewidths=0.1,
                 annot = True, fmt = '.0f', annot_kws={'size':12})
plt.title('Yearly distribution of bike rentals based on fare type\n', fontsize = 16, weight = 'bold')
plt.xlabel('\nYear', fontsize=14)
plt.ylabel('Fare type\n', fontsize=14);
plt.xticks(fontsize="12")
plt.yticks(rotation=0, fontsize="12", va="center")
# The annotations start out as raw counts; turn each into a percentage of the
# total number of rows in bikeshare.
total_rentals = bikeshare.shape[0]
for t in res.texts:
    share = int(t.get_text())/total_rentals
    p = np.round(np.round(share, 4)*100, 1)
    # very small shares get a placeholder instead of a rounded-down value
    t.set_text('< 0.1%' if p < 0.1 else str(p)+' %');
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.54.d Heat map of rentals based on fare type and the year.png', dpi=300, bbox_inches='tight')
Find average rentals based on the year:
The
fare_type column needs to be converted into a categorical datatype; otherwise the groupby method will ignore NaN rental values and in turn produce an incorrect avg. rental value. Also, the size() method ignores the unused level combinations of the groups, hence use the count() method instead.
# Number of trips (non-null trip_id values) for every (year, fare type) pair,
# stored in a 'rentals' column.
year_df = (
    bikeshare
    .groupby(['year', 'fare_type'])['trip_id']
    .count()
    .reset_index(name='rentals')
)
# notebook display of the summary table
year_df
# Point plot of rentals per year, one line per fare type.
plt.figure(figsize=[6, 5])
sb.set_style('white')
flatui = ['#577da1', '#eda668']
sb.set_palette(flatui, n_colors=2, desat=0.6)
base_color = sb.color_palette()[0]
sb.pointplot(x = "year", y = "rentals", hue = 'fare_type', data = year_df,
             linestyles = "-", scale = 1, ci = None)
plt.title('Average yearly bike rentals categorized by fare type\n', weight = 'bold', fontsize = 16)
plt.xlabel('\nYear', fontsize = 14)
plt.ylabel('Avg. bike rentals\n', fontsize = 14)
plt.xticks(fontsize = 12)
# the automatic y ticks are read here but not used below; the axis is
# re-ticked from zero in fixed 50 K steps instead
locs, labels = plt.yticks()
yearly_rental_avg_max = 300000
y_tick_values = np.arange(0, yearly_rental_avg_max+50000, 50000)
y_tick_names = ['{:0.0f} K'.format(tick/1000) for tick in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize = 12)
# legend sits outside the axes, anchored to the right of the plot
legend_style = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
                    framealpha = 1, borderpad=1, borderaxespad=1, loc = 'upper right', labelspacing=0.5,
                    title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
                    markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(1.3, 1))
plt.legend(**legend_style)
sb.despine(top=True, right=True, left=False, bottom=False);
# faint horizontal guide line at every y tick
for grid_y in y_tick_values:
    plt.axhline(grid_y, ls='--', color='grey', linewidth=0.5, alpha=0.3)
# bbox_inches='tight' grows the saved area so the outside legend and all labels are included
plt.savefig('plots/3.2.54.e Average yearly bike rentals categorized by fare type.png', dpi=300, bbox_inches='tight')
duration_min and distance_miles columns:¶Columns: duration_min, distance_milesData type: (Numerical, continuous) and (Numerical, continuous)Plot: Scatter plotScatter plot:
# Scatter plot with seaborn's regression fit: trip duration against distance.
sb.set_palette('deep', n_colors=4, desat=0.8)
current_palette = sb.color_palette()
sb.set_style('white')
sb.regplot(x = 'distance_miles', y = 'duration_min', data = bikeshare);
# titles, axis labels and tick sizes
plt.title('Scatter plot of trip distances and durations\n', weight = 'bold', fontsize = 16)
plt.xlabel('\nDistance (miles)', fontsize = 14)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.yticks(fontsize = 12)
plt.xticks(fontsize = 12)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.55.a Scatter plot of trip distances and durations.png', dpi=300, bbox_inches='tight')
Add transparency to the scatter plot for clear interpretation.
# Same scatter plot as before, with translucent markers so that dense regions
# of overlapping points stand out.
sb.set_palette('deep', n_colors=4, desat=0.8)
current_palette = sb.color_palette()
sb.set_style('white')
sb.regplot(x = 'distance_miles', y = 'duration_min', data = bikeshare,
           scatter_kws = {'alpha' : 1/10});
# titles, axis labels and tick sizes
plt.title('Scatter plot of trip distances and durations\n', weight = 'bold', fontsize = 16)
plt.xlabel('\nDistance (miles)', fontsize = 14)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.yticks(fontsize = 12)
plt.xticks(fontsize = 12)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.55.b Scatter plot of trip distances and durations.png', dpi=300, bbox_inches='tight')
Remove round trips and apply a logarithmic transformation to the data, as it is right-skewed.
def log_trans(x, inverse = False):
    """Apply a base-10 logarithmic transform to x, or undo it.

    Parameters
    ----------
    x : scalar or array-like
        Values to transform.
    inverse : bool, default False
        If False, return log10(x); if True, return 10 raised to x.

    Returns
    -------
    The element-wise result of np.log10 or np.power.
    """
    if inverse:
        # float base: np.power(10, x) raises ValueError for negative integer x
        return np.power(10.0, x)
    return np.log10(x)
# Scatter plot of log10(trip duration) against trip distance.
sb.set_palette('deep', n_colors=4, desat=0.8)
current_palette = sb.color_palette()
sb.set_style('white')
# zero-minute trips are dropped because log10(0) is undefined
df = bikeshare.query(' duration_min != 0 ').copy()
# x and y are passed by keyword (positional data vectors are rejected by newer
# seaborn releases); log_trans is applied to the whole column at once
sb.regplot(x = df['distance_miles'], y = log_trans(df['duration_min']),
           fit_reg = False, scatter_kws = {'alpha' : 1/10})
# improve plot aesthetics
plt.title('Logarithmic transformation of trip distances and durations\n', weight = 'bold', fontsize = 16)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xlabel('\nDistance (miles)', fontsize = 14)
plt.xticks(fontsize = 12)
# ticks are placed in log space but labelled with the original minute values
tick_locs = [1, 10, 100, 1000, 10000]
plt.yticks(log_trans(tick_locs), tick_locs, fontsize = 12);
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.2.55.c Logarithmic transformation of trip distances and durations.png', dpi=300, bbox_inches='tight')
duration_min, distance_miles, and fare columns:¶Columns: duration_min, distance_miles, fareData type: (Numerical, continuous) and (Numerical, continuous)Plot: Scatter plotLogarithmic transformation of trip durations and analysis of trip distances and trip fares.
Scatter plot:
def log_trans(x, inverse = False):
    """Apply a base-10 logarithmic transform to x, or undo it.

    Parameters
    ----------
    x : scalar or array-like
        Values to transform.
    inverse : bool, default False
        If False, return log10(x); if True, return 10 raised to x.

    Returns
    -------
    The element-wise result of np.log10 or np.power.
    """
    if inverse:
        # float base: np.power(10, x) raises ValueError for negative integer x
        return np.power(10.0, x)
    return np.log10(x)
# Scatter plot of log10(duration) against distance, marker area taken from fare.
# zero-minute trips are dropped because log10(0) is undefined
temp_df = bikeshare.query(' duration_min != 0 ').copy()
# transform the whole column at once instead of one Python call per row
temp_df['duration_log'] = log_trans(temp_df['duration_min'])
# assign color palette
sb.set_palette('deep', n_colors=4, desat=0.8)
current_palette = sb.color_palette()
sb.set_style('white')
# the 's' argument names the column whose values set each marker's size
plt.scatter(data = temp_df, x = 'distance_miles', y = 'duration_log', s = 'fare')
# improve plot aesthetics
plt.title('Logarithmic trip distances, durations and fare\n', weight = 'bold', fontsize = 14)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xlabel('\nDistance (miles)', fontsize = 14)
plt.xticks(fontsize = 12)
# ticks are placed in log space but labelled with the original minute values
tick_locs = [1, 10, 100, 1000, 10000]
plt.yticks(log_trans(tick_locs), tick_locs, fontsize = 12);
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.1 Logarithmic transformation of trip distances, durations and fares.png', dpi=300, bbox_inches='tight')
duration_min, distance_miles, trip_type and fare columns:¶Columns: duration_min, distance_miles, fare, trip_typeData type: (Numerical, continuous) and (categorical ,ordered)Plot: Scatter plotLogarithmic transformation of trip durations and analysis of trip distances and trip fares over trip type.
Scatter plot:
def log_trans(x, inverse = False):
    """Apply a base-10 logarithmic transform to x, or undo it.

    Parameters
    ----------
    x : scalar or array-like
        Values to transform.
    inverse : bool, default False
        If False, return log10(x); if True, return 10 raised to x.

    Returns
    -------
    The element-wise result of np.log10 or np.power.
    """
    if inverse:
        # float base: np.power(10, x) raises ValueError for negative integer x
        return np.power(10.0, x)
    return np.log10(x)
# Scatter plot of log10(duration) against distance, marker area taken from
# fare and marker shape from trip type.
# zero-minute trips are dropped because log10(0) is undefined
temp_df = bikeshare.query(' duration_min != 0 ').copy()
# transform the whole column at once instead of one Python call per row
temp_df['duration_log'] = log_trans(temp_df['duration_min'])
# assign color palette
sb.set_palette('deep', n_colors=4, desat=0.8)
current_palette = sb.color_palette()
sb.set_style('white')
# one scatter layer per trip type, each with its own marker shape
cat_markers = [['One Way', 's'],
               ['Round Trip', 'o']]
for cat, marker in cat_markers:
    df_cat = temp_df[temp_df['trip_type'] == cat]
    plt.scatter(data = df_cat, x = 'distance_miles', y = 'duration_log', s = 'fare', alpha = 0.5, marker = marker)
# legend labels come from the same list that drives the plotting order, so
# the two cannot get out of sync
plt.legend([cat for cat, _ in cat_markers])
# improve plot aesthetics
plt.title('Logarithmic trip distances, durations, fares over trip type\n', weight = 'bold', fontsize = 14)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xlabel('\nDistance (miles)', fontsize = 14)
plt.xticks(fontsize = 12)
# ticks are placed in log space but labelled with the original minute values
tick_locs = [1, 10, 100, 1000, 10000]
plt.yticks(log_trans(tick_locs), tick_locs, fontsize = 12);
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.2 Logarithmic transformation of trip distances, durations, fares over trip type.png', dpi=300, bbox_inches='tight')
duration_min, distance_miles, and trip_type columns:¶Columns: duration_min, distance_milesData type: (Numerical, continuous) and (Numerical, continuous)Plot: Scatter plotLogarithmic transformation of trip durations and analysis of trip distances over trip type.
Hexbin plot:
def hexbin(x, y, color, **kwargs):
    """Draw a 30-cell hexbin plot of y against x on the current axes.

    The colormap is a light sequential palette built from `color`; any extra
    keyword arguments are forwarded to plt.hexbin. Used as the plotting
    function handed to FacetGrid.map.
    """
    plt.hexbin(x, y, gridsize=30, cmap=sb.light_palette(color, as_cmap=True), **kwargs)
def log_trans(x, inverse = False):
    """Apply a base-10 logarithmic transform to x, or undo it.

    Parameters
    ----------
    x : scalar or array-like
        Values to transform.
    inverse : bool, default False
        If False, return log10(x); if True, return 10 raised to x.

    Returns
    -------
    The element-wise result of np.log10 or np.power.
    """
    if inverse:
        # float base: np.power(10, x) raises ValueError for negative integer x
        return np.power(10.0, x)
    return np.log10(x)
# Hexbin density of log10(duration) against distance, one facet per trip type.
# zero-minute trips are dropped because log10(0) is undefined
temp_df = bikeshare.query(' duration_min != 0 ').copy()
# transform the whole column at once instead of one Python call per row
temp_df['duration_log'] = log_trans(temp_df['duration_min'])
with sb.axes_style("dark"):
    g = sb.FacetGrid(temp_df, hue="trip_type", col="trip_type", col_wrap = 2, height=3.5)
g.map(hexbin, "distance_miles", "duration_log");
# improve plot aesthetics
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Trip durations and distances over trip type\n', fontsize = 14, weight = 'bold')
# facets are trip types, so the title shows the bare category name
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
g.set_xlabels('\nDistance (miles)', size = 12)
g.set_ylabels('Duration (minutes)\n', size = 12)
x_tick_locs = [0, 5, 10, 15, 20, 25]
y_tick_locs = [1, 10, 100, 1000, 10000]
# set the tick locations on every facet explicitly instead of relying on
# pyplot's current axes; y ticks sit in log space
for ax in g.axes.flat:
    ax.set_yticks(log_trans(y_tick_locs))
    ax.set_xticks(x_tick_locs)
# label the ticks with the untransformed values
g.set_yticklabels(y_tick_locs, size = 10)
g.set_xticklabels(x_tick_locs, size = 10);
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.3.a Logarithmic transformation of trip distances, durations over trip type.png', dpi=300, bbox_inches='tight')
Limit the dataset to trip durations under 120 minutes and distances under 3 miles for closer observation.
def hexbin(x, y, color, **kwargs):
    """Draw a 30-cell hexbin plot of y against x on the current axes.

    The colormap is a light sequential palette built from `color`; any extra
    keyword arguments are forwarded to plt.hexbin. Used as the plotting
    function handed to FacetGrid.map.
    """
    plt.hexbin(x, y, gridsize=30, cmap=sb.light_palette(color, as_cmap=True), **kwargs)
# Hexbin density of duration against distance for short trips
# (at most 120 minutes and 3 miles), one facet per trip type.
temp_df = bikeshare.query(' duration_min <= 120 and distance_miles <= 3 ').copy()
with sb.axes_style("dark"):
    g = sb.FacetGrid(temp_df, hue="trip_type", col="trip_type", col_wrap = 2, height=3.5)
# extent fixes the binned area so every facet uses the same hexagon grid
g.map(hexbin, "distance_miles", "duration_min", extent=[0, 3, 0, 120]);
# improve plot aesthetics
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Subset Trip durations and distances over trip type\n', fontsize = 14, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
g.set_xlabels('\nDistance (miles)', size = 12)
g.set_ylabels('Duration (minutes)\n', size = 12)
x_tick_locs = np.arange(0, 3+1, 1)
y_tick_locs = np.arange(0, 120+20, 20)
# set the tick locations on every facet explicitly instead of relying on
# pyplot's current axes
for ax in g.axes.flat:
    ax.set_yticks(y_tick_locs)
    ax.set_xticks(x_tick_locs)
g.set_yticklabels(y_tick_locs, size = 10)
g.set_xticklabels(x_tick_locs, size = 10);
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.3.b Subset of trip distances, durations over trip type.png', dpi=300, bbox_inches='tight')
duration_min, distance_miles, and bike_type columns:¶Columns: duration_min, distance_milesData type: (Numerical, continuous) and (Numerical, continuous)Plot: Scatter plotLogarithmic transformation of trip durations and analysis of trip distances over bike type.
Hexbin plot:
def hexbin(x, y, color, **kwargs):
    """Draw a 30-cell hexbin plot of y against x on the current axes.

    The colormap is a light sequential palette built from `color`; any extra
    keyword arguments are forwarded to plt.hexbin. Used as the plotting
    function handed to FacetGrid.map.
    """
    plt.hexbin(x, y, gridsize=30, cmap=sb.light_palette(color, as_cmap=True), **kwargs)
def log_trans(x, inverse = False):
    """Apply a base-10 logarithmic transform to x, or undo it.

    Parameters
    ----------
    x : scalar or array-like
        Values to transform.
    inverse : bool, default False
        If False, return log10(x); if True, return 10 raised to x.

    Returns
    -------
    The element-wise result of np.log10 or np.power.
    """
    if inverse:
        # float base: np.power(10, x) raises ValueError for negative integer x
        return np.power(10.0, x)
    return np.log10(x)
# Hexbin density of log10(duration) against distance, one facet per bike type.
# zero-minute trips are dropped because log10(0) is undefined
temp_df = bikeshare.query(' duration_min != 0 ').copy()
# transform the whole column at once instead of one Python call per row
temp_df['duration_log'] = log_trans(temp_df['duration_min'])
with sb.axes_style("dark"):
    g = sb.FacetGrid(temp_df, hue="bike_type", col="bike_type", col_wrap = 2, height=3.5)
g.map(hexbin, "distance_miles", "duration_log");
# improve plot aesthetics
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('Trip durations and distances over bike type\n', fontsize = 16, weight = 'bold')
# facets are bike types, so the title shows the bare category name
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
g.set_xlabels('\nDistance (miles)', size = 12)
g.set_ylabels('Duration (minutes)\n', size = 12)
x_tick_locs = [0, 5, 10, 15, 20, 25]
y_tick_locs = [1, 10, 100, 1000, 10000]
# set the tick locations on every facet explicitly instead of relying on
# pyplot's current axes; y ticks sit in log space
for ax in g.axes.flat:
    ax.set_yticks(log_trans(y_tick_locs))
    ax.set_xticks(x_tick_locs)
# label the ticks with the untransformed values
g.set_yticklabels(y_tick_locs, size = 10)
g.set_xticklabels(x_tick_locs, size = 10);
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.4.a Logarithmic transformation of trip distances, durations over bike type.png', dpi=300, bbox_inches='tight')
Limit the dataset to trip durations under 120 minutes and distances under 3 miles for closer observation.
def hexbin(x, y, color, **kwargs):
    """Draw a 30-cell hexbin plot of y against x on the current axes.

    The colormap is a light sequential palette built from `color`; any extra
    keyword arguments are forwarded to plt.hexbin. Used as the plotting
    function handed to FacetGrid.map.
    """
    plt.hexbin(x, y, gridsize=30, cmap=sb.light_palette(color, as_cmap=True), **kwargs)
# Hexbin density of duration against distance for short trips
# (at most 120 minutes and 3 miles), one facet per bike type.
temp_df = bikeshare.query(' duration_min <= 120 and distance_miles <= 3 ').copy()
with sb.axes_style("dark"):
    g = sb.FacetGrid(temp_df, hue="bike_type", col="bike_type", col_wrap = 2, height=3.5)
# extent fixes the binned area so every facet uses the same hexagon grid
g.map(hexbin, "distance_miles", "duration_min", extent=[0, 3, 0, 120]);
# improve plot aesthetics
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Subset Trip durations and distances over bike type\n', fontsize = 14, weight = 'bold')
# facets are bike types, so the title shows the bare category name
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
g.set_xlabels('\nDistance (miles)', size = 12)
g.set_ylabels('Duration (minutes)\n', size = 12)
x_tick_locs = np.arange(0, 3+1, 1)
y_tick_locs = np.arange(0, 120+20, 20)
# set the tick locations on every facet explicitly instead of relying on
# pyplot's current axes
for ax in g.axes.flat:
    ax.set_yticks(y_tick_locs)
    ax.set_xticks(x_tick_locs)
g.set_yticklabels(y_tick_locs, size = 10)
g.set_xticklabels(x_tick_locs, size = 10);
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.4.b Subset of trip distances, durations over bike type.png', dpi=300, bbox_inches='tight')
duration_min, distance_miles, and pass_type columns:¶Columns: duration_min, distance_milesData type: (Numerical, continuous) and (Numerical, continuous)Plot: Scatter plotLogarithmic transformation of trip durations and analysis of trip distances over pass type.
Hexbin plot:
def hexbin(x, y, color, **kwargs):
    """Draw a 30-cell hexbin plot of y against x on the current axes.

    The colormap is a light sequential palette built from `color`; any extra
    keyword arguments are forwarded to plt.hexbin. Used as the plotting
    function handed to FacetGrid.map.
    """
    plt.hexbin(x, y, gridsize=30, cmap=sb.light_palette(color, as_cmap=True), **kwargs)
def log_trans(x, inverse = False):
    """Apply a base-10 logarithmic transform to x, or undo it.

    Parameters
    ----------
    x : scalar or array-like
        Values to transform.
    inverse : bool, default False
        If False, return log10(x); if True, return 10 raised to x.

    Returns
    -------
    The element-wise result of np.log10 or np.power.
    """
    if inverse:
        # float base: np.power(10, x) raises ValueError for negative integer x
        return np.power(10.0, x)
    return np.log10(x)
# Hexbin density of log10(duration) against distance, one facet per pass type.
# zero-minute trips are dropped because log10(0) is undefined
temp_df = bikeshare.query(' duration_min != 0 ').copy()
# transform the whole column at once instead of one Python call per row
temp_df['duration_log'] = log_trans(temp_df['duration_min'])
with sb.axes_style("dark"):
    g = sb.FacetGrid(temp_df, hue="pass_type", col="pass_type", col_wrap = 3, height=3.5)
g.map(hexbin, "distance_miles", "duration_log");
# improve plot aesthetics
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('Trip durations and distances over pass type\n', fontsize = 16, weight = 'bold')
g.set_titles('Pass = {col_name}', weight = 'bold', size = 14, color = 'dimgrey')
g.set_xlabels('\nDistance (miles)', size = 12)
g.set_ylabels('Duration (minutes)\n', size = 12)
x_tick_locs = [0, 5, 10, 15, 20, 25]
y_tick_locs = [1, 10, 100, 1000, 10000]
# set the tick locations on every facet explicitly instead of relying on
# pyplot's current axes; y ticks sit in log space
for ax in g.axes.flat:
    ax.set_yticks(log_trans(y_tick_locs))
    ax.set_xticks(x_tick_locs)
# label the ticks with the untransformed values
g.set_yticklabels(y_tick_locs, size = 10)
g.set_xticklabels(x_tick_locs, size = 10);
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.5.a Logarithmic transformation of trip distances, durations over pass type.png', dpi=300, bbox_inches='tight')
Limit the dataset to trip durations under 120 minutes and distances under 3 miles for closer observation.
def hexbin(x, y, color, **kwargs):
    """Draw a 30-cell hexbin plot of y against x on the current axes.

    The colormap is a light sequential palette built from `color`; any extra
    keyword arguments are forwarded to plt.hexbin. Used as the plotting
    function handed to FacetGrid.map.
    """
    plt.hexbin(x, y, gridsize=30, cmap=sb.light_palette(color, as_cmap=True), **kwargs)
# Hexbin density of duration against distance for short trips
# (at most 120 minutes and 3 miles), one facet per pass type.
temp_df = bikeshare.query(' duration_min <= 120 and distance_miles <= 3 ').copy()
with sb.axes_style("dark"):
    g = sb.FacetGrid(temp_df, hue="pass_type", col="pass_type", col_wrap = 3, height=3.5)
# extent fixes the binned area so every facet uses the same hexagon grid
g.map(hexbin, "distance_miles", "duration_min", extent=[0, 3, 0, 120]);
# improve plot aesthetics
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Subset Trip durations and distances over pass type\n', fontsize = 14, weight = 'bold')
g.set_titles('Pass = {col_name}', weight = 'bold', size = 14, color = 'dimgrey')
g.set_xlabels('\nDistance (miles)', size = 12)
g.set_ylabels('Duration (minutes)\n', size = 12)
x_tick_locs = np.arange(0, 3+1, 1)
y_tick_locs = np.arange(0, 120+20, 20)
# set the tick locations on every facet explicitly instead of relying on
# pyplot's current axes
for ax in g.axes.flat:
    ax.set_yticks(y_tick_locs)
    ax.set_xticks(x_tick_locs)
g.set_yticklabels(y_tick_locs, size = 10)
g.set_xticklabels(x_tick_locs, size = 10);
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.5.b Subset of trip distances, durations over pass type.png', dpi=300, bbox_inches='tight')
duration_min, distance_miles, and fare_type columns:¶Columns: duration_min, distance_milesData type: (Numerical, continuous) and (Numerical, continuous)Plot: Scatter plotLogarithmic transformation of trip durations and analysis of trip distances over fare type.
Hexbin plot:
def hexbin(x, y, color, **kwargs):
    """Draw a 30-cell hexbin plot of y against x on the current axes.

    The colormap is a light sequential palette built from `color`; any extra
    keyword arguments are forwarded to plt.hexbin. Used as the plotting
    function handed to FacetGrid.map.
    """
    plt.hexbin(x, y, gridsize=30, cmap=sb.light_palette(color, as_cmap=True), **kwargs)
def log_trans(x, inverse = False):
    """Apply a base-10 logarithmic transform to x, or undo it.

    Parameters
    ----------
    x : scalar or array-like
        Values to transform.
    inverse : bool, default False
        If False, return log10(x); if True, return 10 raised to x.

    Returns
    -------
    The element-wise result of np.log10 or np.power.
    """
    if inverse:
        # float base: np.power(10, x) raises ValueError for negative integer x
        return np.power(10.0, x)
    return np.log10(x)
# Hexbin density of log10(duration) against distance, one facet per fare type.
# zero-minute trips are dropped because log10(0) is undefined
temp_df = bikeshare.query(' duration_min != 0 ').copy()
# transform the whole column at once instead of one Python call per row
temp_df['duration_log'] = log_trans(temp_df['duration_min'])
with sb.axes_style("dark"):
    g = sb.FacetGrid(temp_df, hue="fare_type", col="fare_type", col_wrap = 2, height=3.5)
g.map(hexbin, "distance_miles", "duration_log");
# improve plot aesthetics
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Trip durations and distances over fare type\n', fontsize = 14, weight = 'bold')
# facets are fare types, so the title shows the bare category name
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
g.set_xlabels('\nDistance (miles)', size = 12)
g.set_ylabels('Duration (minutes)\n', size = 12)
x_tick_locs = [0, 5, 10, 15, 20, 25]
y_tick_locs = [1, 10, 100, 1000, 10000]
# set the tick locations on every facet explicitly instead of relying on
# pyplot's current axes; y ticks sit in log space
for ax in g.axes.flat:
    ax.set_yticks(log_trans(y_tick_locs))
    ax.set_xticks(x_tick_locs)
# label the ticks with the untransformed values
g.set_yticklabels(y_tick_locs, size = 10)
g.set_xticklabels(x_tick_locs, size = 10);
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.6.a Logarithmic transformation of trip distances, durations over fare type.png', dpi=300, bbox_inches='tight')
Limit the dataset to trip durations under 120 minutes and distances under 3 miles for closer observation.
def hexbin(x, y, color, **kwargs):
    """Draw a 30-cell hexbin plot of y against x on the current axes.

    The colormap is a light sequential palette built from `color`; any extra
    keyword arguments are forwarded to plt.hexbin. Used as the plotting
    function handed to FacetGrid.map.
    """
    plt.hexbin(x, y, gridsize=30, cmap=sb.light_palette(color, as_cmap=True), **kwargs)
# Hexbin density of duration against distance for short trips
# (at most 120 minutes and 3 miles), one facet per fare type.
temp_df = bikeshare.query(' duration_min <= 120 and distance_miles <= 3 ').copy()
with sb.axes_style("dark"):
    g = sb.FacetGrid(temp_df, hue="fare_type", col="fare_type", col_wrap = 2, height=3.5)
# extent fixes the binned area so every facet uses the same hexagon grid
g.map(hexbin, "distance_miles", "duration_min", extent=[0, 3, 0, 120]);
# improve plot aesthetics
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Subset Trip durations and distances over fare type\n', fontsize = 14, weight = 'bold')
# facets are fare types, so the title shows the bare category name
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
g.set_xlabels('\nDistance (miles)', size = 12)
g.set_ylabels('Duration (minutes)\n', size = 12)
x_tick_locs = np.arange(0, 3+1, 1)
y_tick_locs = np.arange(0, 120+20, 20)
# set the tick locations on every facet explicitly instead of relying on
# pyplot's current axes
for ax in g.axes.flat:
    ax.set_yticks(y_tick_locs)
    ax.set_xticks(x_tick_locs)
g.set_yticklabels(y_tick_locs, size = 10)
g.set_xticklabels(x_tick_locs, size = 10);
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.6.b Subset of trip distances, durations over fare type.png', dpi=300, bbox_inches='tight')
duration_min,distance_miles, and trip_type columns:¶Columns: duration_min, distance_miles, trip_typeData type: (Numerical, continuous), (Numerical, continuous), (categorical, nominal)Plot: Facet grid, Scatter plotlmplot:
# lmplot of trip duration against distance with one colour (and one
# regression fit) per trip type.
sb.set_palette('deep', n_colors=2, desat=0.8)
current_palette = sb.color_palette()
sb.set_style('white')
# translucent markers so dense regions of overlapping points stand out
g = sb.lmplot(data = bikeshare, x = 'distance_miles', y = 'duration_min', scatter_kws = {'alpha' : 1/10},
              legend = True, legend_out=True, hue = 'trip_type');
# improve plot aesthetics
plt.title('Trip distances and durations over trip type\n', weight = 'bold', fontsize = 16)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xlabel('\nDistance (miles)', fontsize = 14)
plt.xticks(fontsize = 12)
plt.yticks(fontsize = 12)
# customise the legend: the hue variable is trip_type, so title it accordingly
legend = g._legend
legend.set_title("Trip Type")
# Legend.legendHandles was renamed to legend_handles in matplotlib 3.7;
# support both spellings
handles = legend.legend_handles if hasattr(legend, 'legend_handles') else legend.legendHandles
# make the legend markers opaque and enlarge them so they stay readable
for lh in handles:
    lh.set_alpha(1)
    lh._sizes = [50]
# bbox_inches='tight' grows the saved area so the outside legend and all labels are included
plt.savefig('plots/3.3.7.a Trip distances and durations over trip type.png', dpi=300, bbox_inches='tight')
Logarithmic transformation of trip durations and analysis of trip distances and trip fares over trip type.
def log_trans(x, inverse = False):
    """Apply a base-10 logarithmic transform to x, or undo it.

    Parameters
    ----------
    x : scalar or array-like
        Values to transform.
    inverse : bool, default False
        If False, return log10(x); if True, return 10 raised to x.

    Returns
    -------
    The element-wise result of np.log10 or np.power.
    """
    if inverse:
        # float base: np.power(10, x) raises ValueError for negative integer x
        return np.power(10.0, x)
    return np.log10(x)
# lmplot of log10(duration) against distance with one colour (and one
# regression fit) per trip type.
sb.set_palette('deep', n_colors=2, desat=0.8)
current_palette = sb.color_palette()
sb.set_style('white')
# zero-minute trips are dropped because log10(0) is undefined
df = bikeshare.query(' duration_min != 0 ').copy()
# transform the whole column at once instead of one Python call per row
df['duration_log'] = log_trans(df['duration_min'])
g = sb.lmplot(data = df, x = 'distance_miles', y = 'duration_log', scatter_kws = {'alpha' : 1/10},
              legend = True, legend_out=True, hue = 'trip_type');
# improve plot aesthetics
plt.title('Logarithmic Trip distances and durations over trip type\n', weight = 'bold', fontsize = 16)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xlabel('\nDistance (miles)', fontsize = 14)
plt.xticks(fontsize = 12)
# ticks are placed in log space but labelled with the original minute values
tick_locs = [1, 10, 100, 1000, 10000]
plt.yticks(log_trans(tick_locs), tick_locs);
# customise the legend: the hue variable is trip_type, so title it accordingly
legend = g._legend
legend.set_title("Trip Type")
# Legend.legendHandles was renamed to legend_handles in matplotlib 3.7;
# support both spellings
handles = legend.legend_handles if hasattr(legend, 'legend_handles') else legend.legendHandles
# make the legend markers opaque and enlarge them so they stay readable
for lh in handles:
    lh.set_alpha(1)
    lh._sizes = [50]
# bbox_inches='tight' grows the saved area so the outside legend and all labels are included
plt.savefig('plots/3.3.7.b Logarithmic Trip distances and durations over trip type.png', dpi=300, bbox_inches='tight')
Facet grid:
def log_trans(x, inverse = False):
    """Apply a base-10 logarithmic transform to x, or undo it.

    Parameters
    ----------
    x : scalar or array-like
        Values to transform.
    inverse : bool, default False
        If False, return log10(x); if True, return 10 raised to x.

    Returns
    -------
    The element-wise result of np.log10 or np.power.
    """
    if inverse:
        # float base: np.power(10, x) raises ValueError for negative integer x
        return np.power(10.0, x)
    return np.log10(x)
# Facet grid of log10(duration) against distance, one scatter plot
# (no regression line) per trip type.
sb.set_palette('deep', n_colors=2, desat=0.8)
current_palette = sb.color_palette()
# zero-minute trips are dropped because log10(0) is undefined
temp_df = bikeshare.query(' duration_min != 0 ').copy()
# transform the whole column at once instead of one Python call per row
temp_df['duration_log'] = log_trans(temp_df['duration_min'])
# plot facet grid
g = sb.FacetGrid(data = temp_df, col = 'trip_type', col_wrap = 2, height = 4, aspect = 1, hue = 'trip_type')
g.map(sb.regplot, 'distance_miles', 'duration_log', fit_reg = False, scatter_kws = {'alpha' : 1/10})
# improve plot aesthetics
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Facet grid of bike rentals over trip type\n', fontsize = 14, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
g.set_xlabels('\nDistance (miles)', size = 12)
g.set_ylabels('Duration (minutes)\n', size = 12)
x_tick_locs = [0, 5, 10, 15, 20, 25]
y_tick_locs = [1, 10, 100, 1000, 10000]
# set the tick locations on every facet explicitly instead of relying on
# pyplot's current axes; y ticks sit in log space
for ax in g.axes.flat:
    ax.set_yticks(log_trans(y_tick_locs))
    ax.set_xticks(x_tick_locs)
# label the ticks with the untransformed values
g.set_yticklabels(y_tick_locs, size = 10)
g.set_xticklabels(x_tick_locs, size = 10)
plt.subplots_adjust(wspace=0.1, hspace=0.3);
g.add_legend()
# customise the legend
legend = g._legend
legend.set_title("Trip Type")
# Legend.legendHandles was renamed to legend_handles in matplotlib 3.7;
# support both spellings
handles = legend.legend_handles if hasattr(legend, 'legend_handles') else legend.legendHandles
# make the legend markers opaque and enlarge them so they stay readable
for lh in handles:
    lh.set_alpha(1)
    lh._sizes = [50]
# bbox_inches='tight' grows the saved area so the outside legend and all labels are included
plt.savefig('plots/3.3.7.c Facet grid of Trip distances and durations over trip type.png', dpi=300, bbox_inches='tight')
Plot a regression line for analysis purpose:
def log_trans(x, inverse = False):
    """Apply a base-10 logarithmic transform to x, or undo it.

    Parameters
    ----------
    x : scalar or array-like
        Values to transform.
    inverse : bool, default False
        If False, return log10(x); if True, return 10 raised to x.

    Returns
    -------
    The element-wise result of np.log10 or np.power.
    """
    if inverse:
        # float base: np.power(10, x) raises ValueError for negative integer x
        return np.power(10.0, x)
    return np.log10(x)
# Facet grid of log10(duration) against distance, one scatter plot with a
# fitted regression line per trip type.
sb.set_palette('deep', n_colors=2, desat=0.8)
current_palette = sb.color_palette()
# zero-minute trips are dropped because log10(0) is undefined
temp_df = bikeshare.query(' duration_min != 0 ').copy()
# transform the whole column at once instead of one Python call per row
temp_df['duration_log'] = log_trans(temp_df['duration_min'])
# plot facet grid
g = sb.FacetGrid(data = temp_df, col = 'trip_type', col_wrap = 2, height = 4, aspect = 1, hue = 'trip_type')
g.map(sb.regplot, 'distance_miles', 'duration_log', fit_reg = True, scatter_kws = {'alpha' : 1/10})
# improve plot aesthetics
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Facet grid of bike rentals over trip type\n', fontsize = 14, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
g.set_xlabels('\nDistance (miles)', size = 12)
g.set_ylabels('Duration (minutes)\n', size = 12)
x_tick_locs = [0, 5, 10, 15, 20, 25]
y_tick_locs = [1, 10, 100, 1000, 10000]
# set the tick locations on every facet explicitly instead of relying on
# pyplot's current axes; y ticks sit in log space
for ax in g.axes.flat:
    ax.set_yticks(log_trans(y_tick_locs))
    ax.set_xticks(x_tick_locs)
# label the ticks with the untransformed values
g.set_yticklabels(y_tick_locs, size = 10)
g.set_xticklabels(x_tick_locs, size = 10)
plt.subplots_adjust(wspace=0.1, hspace=0.3);
g.add_legend()
# customise the legend
legend = g._legend
legend.set_title("Trip Type")
# Legend.legendHandles was renamed to legend_handles in matplotlib 3.7;
# support both spellings
handles = legend.legend_handles if hasattr(legend, 'legend_handles') else legend.legendHandles
# make the legend markers opaque and enlarge them so they stay readable
for lh in handles:
    lh.set_alpha(1)
    lh._sizes = [50]
# bbox_inches='tight' grows the saved area so the outside legend and all labels are included
plt.savefig('plots/3.3.7.d Facet grid of Trip distances and durations over trip type.png', dpi=300, bbox_inches='tight')
duration_min,distance_miles, and bike_type columns:¶Columns: duration_min, distance_miles, bike_typeData type: (Numerical, continuous), (Numerical, continuous), (categorical, nominal)Plot: Facet grid, Scatter plotlmplot:
def log_trans(x, inverse = False):
    """Transform values to log10 space, or back to linear space.

    x       : scalar or array-like accepted by np.log10 / np.power
    inverse : when False return log10(x); when True return 10 ** x
    """
    if not inverse:
        return np.log10(x)
    else:
        return np.power(10, x)


# Single scatter plot of log10(duration) against distance, coloured by bike type.
flatui = ['#ff91e2', '#91ffda', '#60acfc', '#bd91ff']
sb.set_palette(flatui, n_colors=4, desat=0.8)
# zero-minute trips are excluded because log10(0) is -inf
df = bikeshare.query(' duration_min != 0 ').copy()
# transform the whole column in one vectorised call rather than element by element
df['duration_log'] = log_trans(df['duration_min'])
g = sb.lmplot(data = df, x = 'distance_miles', y = 'duration_log', legend = True, legend_out=True,
              fit_reg = False, scatter_kws = {'alpha' : 1/10}, hue = 'bike_type')
# improve plot aesthetics
plt.title('Distribution of bike rentals over bike type\n', weight = 'bold', fontsize = 14)
plt.ylabel('Duration (minutes)\n', fontsize = 14)
plt.xlabel('\nDistance (miles)', fontsize = 14)
plt.xticks(fontsize = 12)
# ticks sit at log10(value) but are labelled with the value in minutes
tick_locs = [1, 10, 100, 1000, 10000]
plt.yticks(log_trans(tick_locs), tick_locs, fontsize = 12)
# add custom legend
g._legend.set_title("Bike Type")
# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles (old name removed in 3.9)
legend_handles = getattr(g._legend, 'legend_handles', None)
if legend_handles is None:
    legend_handles = g._legend.legendHandles
for lh in legend_handles:
    # legend markers are drawn opaque and larger than the translucent data points
    lh.set_alpha(1)
    lh.set_sizes([50])
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.3.8.a Logarithmic Trip distances and durations over bike type.png', dpi=300, bbox_inches='tight')
The data related to each bike category are overlapped on each other and difficult to interpret. Plot the distribution of each bike type on an individual plot for better interpretation.
Facet grid:
def log_trans(x, inverse = False):
    """Transform values to log10 space, or back to linear space.

    x       : scalar or array-like accepted by np.log10 / np.power
    inverse : when False return log10(x); when True return 10 ** x
    """
    if not inverse:
        return np.log10(x)
    else:
        return np.power(10, x)


# Scatter plot of log10(duration) against distance, one facet per bike type.
flatui = ['#ff91e2', '#91ffda', '#60acfc', '#bd91ff']
sb.set_palette(flatui, n_colors=4, desat=0.8)
current_palette = sb.color_palette()
# zero-minute trips are excluded because log10(0) is -inf
temp_df = bikeshare.query(' duration_min != 0 ').copy()
# transform the whole column in one vectorised call rather than element by element
temp_df['duration_log'] = log_trans(temp_df['duration_min'])
# plot facet grid
g = sb.FacetGrid(data = temp_df, col = 'bike_type', col_wrap = 2, height = 3, aspect = 1, hue = 'bike_type')
g.map(sb.regplot, 'distance_miles', 'duration_log', fit_reg = False, scatter_kws = {'alpha' : 1/10})
# improve plot aesthetics
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Distribution of bike rentals over bike type\n', fontsize = 14, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
g.set_xlabels('\nDistance (miles)', size = 12)
g.set_ylabels('Duration (minutes)\n', size = 12)
x_tick_locs = [0, 5, 10, 15, 20, 25]
y_tick_locs = [1, 10, 100, 1000, 10000]
# Set ticks on every facet explicitly; pyplot's xticks/yticks only act on the current axes.
for ax in g.axes.flat:
    ax.set_xticks(x_tick_locs)
    ax.set_xticklabels(x_tick_locs, size = 10)
    # the data is plotted in log10 space, so ticks sit at log10(value) but show the value itself
    ax.set_yticks(log_trans(y_tick_locs))
    ax.set_yticklabels(y_tick_locs, size = 10)
plt.subplots_adjust(wspace=0.1, hspace=0.3)
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.3.8.b Facet grid of Trip distances and durations over bike type.png', dpi=300, bbox_inches='tight')
Limit the dataset to trip durations under 120 minutes and distances under 3 miles for closer observation.
# Scatter plot of duration against distance for the dense region of the data
# (non-zero durations up to 120 minutes, distances up to 3 miles), one facet per bike type.
flatui = ['#ff91e2', '#91ffda', '#60acfc', '#bd91ff']
sb.set_palette(flatui, n_colors=4, desat=0.8)
current_palette = sb.color_palette()
temp_df = bikeshare.query(' duration_min != 0 and duration_min <= 120 and distance_miles <= 3 ').copy()

# build the grid, then draw a very translucent scatter on each facet
facet_options = dict(col='bike_type', col_wrap=2, height=3, aspect=1, hue='bike_type')
g = sb.FacetGrid(data=temp_df, **facet_options)
g.map(
    sb.regplot,
    'distance_miles',
    'duration_min',
    fit_reg=False,
    scatter_kws=dict(alpha=1/100),
)

# figure title sits in the space freed at the top of the figure
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle(
    'Subset distribution of bike rentals over bike type\n',
    fontsize=14,
    weight='bold',
)
# per-facet titles and axis labels
(g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')
  .set_xlabels('\nDistance (miles)', size=12)
  .set_ylabels('Duration (minutes)\n', size=12))
plt.subplots_adjust(wspace=0.1, hspace=0.3);

# bbox_inches='tight' keeps all of the x and y labels inside the saved image
plt.savefig(
    'plots/3.3.8.c Subset Facet grid of Trip distances and durations over bike type.png',
    dpi=300,
    bbox_inches='tight',
)
Plot a regression line for analysis purpose:
# Same subset as the previous plot (non-zero durations up to 120 minutes, distances up to
# 3 miles), now with a grey regression line on each bike-type facet.
flatui = ['#ff91e2', '#91ffda', '#60acfc', '#bd91ff']
sb.set_palette(flatui, n_colors=4, desat=0.8)
current_palette = sb.color_palette()
temp_df = bikeshare.query(' duration_min != 0 and duration_min <= 120 and distance_miles <= 3 ').copy()

# build the grid, then draw scatter plus fitted line on each facet
facet_options = dict(col='bike_type', col_wrap=2, height=3, aspect=1, hue='bike_type')
g = sb.FacetGrid(data=temp_df, **facet_options)
g.map(
    sb.regplot,
    'distance_miles',
    'duration_min',
    fit_reg=True,
    scatter_kws=dict(alpha=1/100),
    line_kws=dict(color='grey'),
)

# figure title sits in the space freed at the top of the figure
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle(
    'Distribution of bike rentals over bike type\n',
    fontsize=14,
    weight='bold',
)
# per-facet titles and axis labels
(g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')
  .set_xlabels('\nDistance (miles)', size=12)
  .set_ylabels('Duration (minutes)\n', size=12))
plt.subplots_adjust(wspace=0.1, hspace=0.3);

# bbox_inches='tight' keeps all of the x and y labels inside the saved image
plt.savefig(
    'plots/3.3.8.d Subset Facet grid of Trip distances and durations over bike type with regression.png',
    dpi=300,
    bbox_inches='tight',
)
duration_min,distance_miles, and pass_type columns:¶Columns: duration_min, distance_miles, pass_typeData type: (Numerical, continuous), (Numerical, continuous), (categorical, nominal)Plot: Facet grid, Scatter plotFacet grid:
def log_trans(x, inverse = False):
    """Transform values to log10 space, or back to linear space.

    x       : scalar or array-like accepted by np.log10 / np.power
    inverse : when False return log10(x); when True return 10 ** x
    """
    if not inverse:
        return np.log10(x)
    else:
        return np.power(10, x)


# Scatter plot of log10(duration) against distance, one facet per pass type.
flatui = ["#26bda7", "#9b59b6", "#3498db", "#e74c3c", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.8)
current_palette = sb.color_palette()
# zero-minute trips are excluded because log10(0) is -inf
temp_df = bikeshare.query(' duration_min != 0 ').copy()
# transform the whole column in one vectorised call rather than element by element
temp_df['duration_log'] = log_trans(temp_df['duration_min'])
# plot facet grid
g = sb.FacetGrid(data = temp_df, col = 'pass_type', col_wrap = 3, height = 3, aspect = 1, hue = 'pass_type')
g.map(sb.regplot, 'distance_miles', 'duration_log', fit_reg = False, scatter_kws = {'alpha' : 1/10})
# improve plot aesthetics
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Distribution of bike rentals over pass type\n', fontsize = 14, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 12, color = 'dimgrey')
g.set_xlabels('\nDistance (miles)', size = 12)
g.set_ylabels('Duration (minutes)\n', size = 12)
x_tick_locs = [0, 5, 10, 15, 20, 25]
y_tick_locs = [1, 10, 100, 1000, 10000]
# Set ticks on every facet explicitly; pyplot's xticks/yticks only act on the current axes.
for ax in g.axes.flat:
    ax.set_xticks(x_tick_locs)
    ax.set_xticklabels(x_tick_locs, size = 10)
    # the data is plotted in log10 space, so ticks sit at log10(value) but show the value itself
    ax.set_yticks(log_trans(y_tick_locs))
    ax.set_yticklabels(y_tick_locs, size = 10)
plt.subplots_adjust(wspace=0.1, hspace=0.3)
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.3.9.a Facet grid of Trip distances and durations over pass type.png', dpi=300, bbox_inches='tight')
Limit the dataset to trip durations under 120 minutes and distances under 3 miles for closer observation.
# Scatter plot of duration against distance for the dense region of the data
# (non-zero durations up to 120 minutes, distances up to 3 miles), one facet per pass type.
flatui = ["#26bda7", "#9b59b6", "#3498db", "#e74c3c", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.8)
temp_df = bikeshare.query(' duration_min != 0 and duration_min <= 120 and distance_miles <= 3 ').copy()

# build the grid, then draw a very translucent scatter on each facet
facet_options = dict(col='pass_type', col_wrap=3, height=3, aspect=1, hue='pass_type')
g = sb.FacetGrid(data=temp_df, **facet_options)
g.map(
    sb.regplot,
    'distance_miles',
    'duration_min',
    fit_reg=False,
    scatter_kws=dict(alpha=1/100),
)

# figure title sits in the space freed at the top of the figure
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle(
    'Distribution of bike rentals over pass type\n',
    fontsize=14,
    weight='bold',
)
# per-facet titles and axis labels
(g.set_titles('{col_name}', weight='bold', size=12, color='dimgrey')
  .set_xlabels('\nDistance (miles)', size=12)
  .set_ylabels('Duration (minutes)\n', size=12))
plt.subplots_adjust(wspace=0.1, hspace=0.3);

# bbox_inches='tight' keeps all of the x and y labels inside the saved image
plt.savefig(
    'plots/3.3.9.b Subset Facet grid of Trip distances and durations over pass type.png',
    dpi=300,
    bbox_inches='tight',
)
Plot a regression line for analysis purpose:
# Scatter plot with a grey regression line of duration against distance for the subset
# (non-zero durations up to 120 minutes, distances up to 3 miles), one facet per pass type.
flatui = ["#26bda7", "#9b59b6", "#3498db", "#e74c3c", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.8)
temp_df = bikeshare.query(' duration_min != 0 and duration_min <= 120 and distance_miles <= 3 ').copy()
# plot facet grid
g = sb.FacetGrid(data = temp_df, col = 'pass_type', col_wrap = 3, height = 3, aspect = 1, hue = 'pass_type')
g.map(sb.regplot, 'distance_miles', 'duration_min', fit_reg = True, scatter_kws = {'alpha' : 1/100},
      line_kws = {'color' : 'grey'})
# improve plot aesthetics
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Distribution of bike rentals over pass type\n', fontsize = 14, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 12, color = 'dimgrey')
g.set_xlabels('\nDistance (miles)', size = 12)
g.set_ylabels('Duration (minutes)\n', size = 12)
# ticks every mile on x and every 20 minutes on y, covering the queried subset range
x_tick_locs = np.arange(0, 3+1, 1)
y_tick_locs = np.arange(0, 120+20, 20)
# Set ticks on every facet explicitly; pyplot's xticks/yticks only act on the current axes.
for ax in g.axes.flat:
    ax.set_xticks(x_tick_locs)
    ax.set_xticklabels(x_tick_locs, size = 10)
    ax.set_yticks(y_tick_locs)
    ax.set_yticklabels(y_tick_locs, size = 10)
plt.subplots_adjust(wspace=0.1, hspace=0.3)
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.3.9.c Subset Facet grid of Trip distances and durations over pass type with regression.png', dpi=300, bbox_inches='tight')
duration_min,distance_miles, and fare_type columns:¶Columns: duration_min, distance_miles, fare_typeData type: (Numerical, continuous), (Numerical, continuous), (categorical, nominal)Plot: Facet grid, Scatter plotlmplot:
def log_trans(x, inverse = False):
    """Transform values to log10 space, or back to linear space.

    x       : scalar or array-like accepted by np.log10 / np.power
    inverse : when False return log10(x); when True return 10 ** x
    """
    if not inverse:
        return np.log10(x)
    else:
        return np.power(10, x)


# Scatter plot with regression of log10(duration) against distance,
# coloured by trip type with one facet per fare type.
sb.set_palette('deep', n_colors=2, desat=0.8)
current_palette = sb.color_palette()
# zero-minute trips are excluded because log10(0) is -inf
temp_df = bikeshare.query(' duration_min != 0 ').copy()
# transform the whole column in one vectorised call rather than element by element
temp_df['duration_log'] = log_trans(temp_df['duration_min'])
# plot facet grid
# legend=False: lmplot draws a legend by default, and the explicit add_legend() call
# further down would otherwise add a second one on top of it
g = sb.lmplot(x='distance_miles', y='duration_log', hue='trip_type', col='fare_type',
              data=temp_df, legend=False)
# improve plot aesthetics
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Distribution of bike rentals over trip type and segmented by fare type\n', fontsize = 14, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 14, color = 'dimgrey')
g.set_xlabels('\nDistance (miles)', size = 12)
g.set_ylabels('Duration (minutes)\n', size = 12)
x_tick_locs = [0, 5, 10, 15, 20, 25]
y_tick_locs = [1, 10, 100, 1000, 10000]
# Set ticks on every facet explicitly; pyplot's xticks/yticks only act on the current axes.
for ax in g.axes.flat:
    ax.set_xticks(x_tick_locs)
    ax.set_xticklabels(x_tick_locs, size = 10)
    # the data is plotted in log10 space, so ticks sit at log10(value) but show the value itself
    ax.set_yticks(log_trans(y_tick_locs))
    ax.set_yticklabels(y_tick_locs, size = 10)
# add legend
g.add_legend()
# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles (old name removed in 3.9)
legend_handles = getattr(g._legend, 'legend_handles', None)
if legend_handles is None:
    legend_handles = g._legend.legendHandles
for lh in legend_handles:
    lh.set_alpha(1)
    lh.set_sizes([50])
plt.subplots_adjust(wspace=0.1, hspace=0.3)
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.3.10 Facet grid of Trip distances and durations over fare type.png', dpi=300, bbox_inches='tight')
duration_min,distance_miles, bike_type and trip_type columns:¶Columns: duration_min, distance_miles, bike_type, trip_typeData type: (Numerical, continuous), (Numerical, continuous), (categorical, nominal), (categorical, nominal)Plot: Facet grid, Scatter plotLimit the dataset to trip durations under 120 minutes and distances under 3 miles for closer observation.
Facet grid:
# Scatter plot with regression of duration against distance for the subset
# (non-zero durations up to 120 minutes, distances up to 3 miles),
# one facet per bike type, coloured by trip type.
sb.set_palette('deep', n_colors=2, desat=0.8)
current_palette = sb.color_palette()
temp_df = bikeshare.query(' duration_min != 0 and duration_min <= 120 and distance_miles <= 3 ').copy()
# plot facet grid
g = sb.FacetGrid(data = temp_df, col = 'bike_type', col_wrap = 2, height = 3, aspect = 1, hue = 'trip_type')
g.map(sb.regplot, 'distance_miles', 'duration_min', fit_reg = True, scatter_kws = {'alpha' : 1/100})
# improve plot aesthetics
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Distribution of bike rentals over bike type and trip type\n', fontsize = 14, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 12, color = 'dimgrey')
g.set_xlabels('\nDistance (miles)', size = 12)
g.set_ylabels('Duration (minutes)\n', size = 12)
# ticks every mile on x and every 20 minutes on y, covering the queried subset range
x_tick_locs = np.arange(0, 3+1, 1)
y_tick_locs = np.arange(0, 120+20, 20)
# Set ticks on every facet explicitly; pyplot's xticks/yticks only act on the current axes.
for ax in g.axes.flat:
    ax.set_xticks(x_tick_locs)
    ax.set_xticklabels(x_tick_locs, size = 10)
    ax.set_yticks(y_tick_locs)
    ax.set_yticklabels(y_tick_locs, size = 10)
plt.subplots_adjust(wspace=0.1, hspace=0.3)
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.3.11 Facet grid of Trip distances and durations over bike type and trip type.png', dpi=300, bbox_inches='tight')
duration_min,distance_miles, pass_type and trip_type columns:¶Columns: duration_min, distance_miles, pass_type, trip_typeData type: (Numerical, continuous), (Numerical, continuous), (categorical, nominal), (categorical, nominal)Plot: Facet grid, Scatter plotLimit the dataset to trip durations under 120 minutes and distances under 3 miles for closer observation.
Facet grid:
# Scatter plot with regression of duration against distance for the subset
# (non-zero durations up to 120 minutes, distances up to 3 miles),
# one facet per pass type, coloured by trip type.
sb.set_palette('deep', n_colors=2, desat=0.8)
current_palette = sb.color_palette()
temp_df = bikeshare.query(' duration_min != 0 and duration_min <= 120 and distance_miles <= 3 ').copy()
# plot facet grid
g = sb.FacetGrid(data = temp_df, col = 'pass_type', col_wrap = 3, height = 3, aspect = 1, hue = 'trip_type')
g.map(sb.regplot, 'distance_miles', 'duration_min', fit_reg = True, scatter_kws = {'alpha' : 1/100})
# improve plot aesthetics
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Distribution of bike rentals over pass type and trip type\n', fontsize = 14, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 12, color = 'dimgrey')
g.set_xlabels('\nDistance (miles)', size = 12)
g.set_ylabels('Duration (minutes)\n', size = 12)
# ticks every mile on x and every 20 minutes on y, covering the queried subset range
x_tick_locs = np.arange(0, 3+1, 1)
y_tick_locs = np.arange(0, 120+20, 20)
# Set ticks on every facet explicitly; pyplot's xticks/yticks only act on the current axes.
for ax in g.axes.flat:
    ax.set_xticks(x_tick_locs)
    ax.set_xticklabels(x_tick_locs, size = 10)
    ax.set_yticks(y_tick_locs)
    ax.set_yticklabels(y_tick_locs, size = 10)
plt.subplots_adjust(wspace=0.1, hspace=0.3)
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.3.12 Facet grid of Trip distances and durations over pass type and trip type.png', dpi=300, bbox_inches='tight')
duration_min,distance_miles, bike_type and pass_type columns:¶Columns: duration_min, distance_miles, bike_type, pass_typeData type: (Numerical, continuous), (Numerical, continuous), (categorical, nominal), (categorical, nominal)Plot: Facet grid, Scatter plotLimit the dataset to trip durations under 120 minutes and distances under 3 miles for closer observation.
Facet grid:
# Scatter plot of duration against distance for the subset (non-zero durations up to
# 120 minutes, distances up to 3 miles), one facet per bike type, coloured by pass type.
flatui = ["#26bda7", "#9b59b6", "#3498db", "#e74c3c", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.8)
temp_df = bikeshare.query(' duration_min != 0 and duration_min <= 120 and distance_miles <= 3 ').copy()

# build the grid, then draw a very translucent scatter on each facet
facet_options = dict(col='bike_type', col_wrap=2, height=3, aspect=1, hue='pass_type')
g = sb.FacetGrid(data=temp_df, **facet_options)
g.map(
    sb.regplot,
    'distance_miles',
    'duration_min',
    fit_reg=False,
    scatter_kws=dict(alpha=1/100),
)

# figure title sits in the space freed at the top of the figure
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle(
    'Distribution of bike rentals over bike type and pass type\n',
    fontsize=14,
    weight='bold',
)
# per-facet titles and axis labels
(g.set_titles('{col_name}', weight='bold', size=12, color='dimgrey')
  .set_xlabels('\nDistance (miles)', size=12)
  .set_ylabels('Duration (minutes)\n', size=12))
plt.subplots_adjust(wspace=0.1, hspace=0.3);

# bbox_inches='tight' keeps all of the x and y labels inside the saved image
plt.savefig(
    'plots/3.3.13.a Facet grid of Trip distances and durations over bike type and pass type.png',
    dpi=300,
    bbox_inches='tight',
)
Plot a regression line for analysis purpose:
# Scatter plot with regression of duration against distance for the subset
# (non-zero durations up to 120 minutes, distances up to 3 miles),
# one facet per bike type, coloured by pass type.
flatui = ["#26bda7", "#9b59b6", "#3498db", "#e74c3c", "#34495e"]
sb.set_palette(flatui, n_colors=5, desat=0.8)
temp_df = bikeshare.query(' duration_min != 0 and duration_min <= 120 and distance_miles <= 3 ').copy()
# plot facet grid
g = sb.FacetGrid(data = temp_df, col = 'bike_type', col_wrap = 2, height = 3, aspect = 1, hue = 'pass_type')
g.map(sb.regplot, 'distance_miles', 'duration_min', fit_reg = True, scatter_kws = {'alpha' : 1/100})
# improve plot aesthetics
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Distribution of bike rentals over bike type and pass type\n', fontsize = 14, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 12, color = 'dimgrey')
g.set_xlabels('\nDistance (miles)', size = 12)
g.set_ylabels('Duration (minutes)\n', size = 12)
# ticks every mile on x and every 20 minutes on y, covering the queried subset range
x_tick_locs = np.arange(0, 3+1, 1)
y_tick_locs = np.arange(0, 120+20, 20)
# Set ticks on every facet explicitly; pyplot's xticks/yticks only act on the current axes.
for ax in g.axes.flat:
    ax.set_xticks(x_tick_locs)
    ax.set_xticklabels(x_tick_locs, size = 10)
    ax.set_yticks(y_tick_locs)
    ax.set_yticklabels(y_tick_locs, size = 10)
plt.subplots_adjust(wspace=0.1, hspace=0.3)
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.3.13.b Facet grid of Trip distances and durations over bike type and pass type with regression.png', dpi=300, bbox_inches='tight')
duration_min,distance_miles, bike_type and fare_type columns:¶Columns: duration_min, distance_miles, bike_type, fare_typeData type: (Numerical, continuous), (Numerical, continuous), (categorical, nominal), (categorical, nominal)Plot: Facet grid, Scatter plotLimit the dataset to trip durations under 120 minutes and distances under 3 miles for closer observation.
Facet grid:
# Scatter plot of duration against distance for the subset (non-zero durations up to
# 120 minutes, distances up to 3 miles), one facet per bike type, coloured by fare type.
flatui = ["#e74c9c", "#26bd97"]
sb.set_palette(flatui, n_colors=2, desat=0.8)
temp_df = bikeshare.query(' duration_min != 0 and duration_min <= 120 and distance_miles <= 3 ').copy()

# build the grid, then draw a very translucent scatter on each facet
facet_options = dict(col='bike_type', col_wrap=2, height=3, aspect=1, hue='fare_type')
g = sb.FacetGrid(data=temp_df, **facet_options)
g.map(
    sb.regplot,
    'distance_miles',
    'duration_min',
    fit_reg=False,
    scatter_kws=dict(alpha=1/100),
)

# figure title sits in the space freed at the top of the figure
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle(
    'Distribution of bike rentals over bike type and fare type\n',
    fontsize=14,
    weight='bold',
)
# per-facet titles and axis labels
(g.set_titles('{col_name}', weight='bold', size=12, color='dimgrey')
  .set_xlabels('\nDistance (miles)', size=12)
  .set_ylabels('Duration (minutes)\n', size=12))
plt.subplots_adjust(wspace=0.1, hspace=0.3);

# bbox_inches='tight' keeps all of the x and y labels inside the saved image
plt.savefig(
    'plots/3.3.14.a Facet grid of Trip distances and durations over bike type and fare type.png',
    dpi=300,
    bbox_inches='tight',
)
Plot a regression line for analysis purpose:
# Scatter plot with regression of duration against distance for the subset
# (non-zero durations up to 120 minutes, distances up to 3 miles),
# one facet per bike type, coloured by fare type.
flatui = ["#e74c9c", "#26bd97"]
sb.set_palette(flatui, n_colors=2, desat=0.8)
temp_df = bikeshare.query(' duration_min != 0 and duration_min <= 120 and distance_miles <= 3 ').copy()
# plot facet grid
g = sb.FacetGrid(data = temp_df, col = 'bike_type', col_wrap = 2, height = 3, aspect = 1, hue = 'fare_type')
g.map(sb.regplot, 'distance_miles', 'duration_min', fit_reg = True, scatter_kws = {'alpha' : 1/100})
# improve plot aesthetics
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Distribution of bike rentals over bike type and fare type\n', fontsize = 14, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 12, color = 'dimgrey')
g.set_xlabels('\nDistance (miles)', size = 12)
g.set_ylabels('Duration (minutes)\n', size = 12)
# ticks every mile on x and every 20 minutes on y, covering the queried subset range
x_tick_locs = np.arange(0, 3+1, 1)
y_tick_locs = np.arange(0, 120+20, 20)
# Set ticks on every facet explicitly; pyplot's xticks/yticks only act on the current axes.
for ax in g.axes.flat:
    ax.set_xticks(x_tick_locs)
    ax.set_xticklabels(x_tick_locs, size = 10)
    ax.set_yticks(y_tick_locs)
    ax.set_yticklabels(y_tick_locs, size = 10)
plt.subplots_adjust(wspace=0.1, hspace=0.3)
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.3.14.b Facet grid of Trip distances and durations over bike type and fare type with regression.png', dpi=300, bbox_inches='tight')
duration_min,distance_miles, pass_type and fare_type columns:¶Columns: duration_min, distance_miles, pass_type, fare_typeData type: (Numerical, continuous), (Numerical, continuous), (categorical, nominal), (categorical, nominal)Plot: Facet grid, Scatter plotLimit the dataset to trip durations under 120 minutes and distances under 3 miles for closer observation.
Facet grid:
# Scatter plot of duration against distance for the subset (non-zero durations up to
# 120 minutes, distances up to 3 miles), one facet per pass type, coloured by fare type.
flatui = ["#e74c9c", "#26bd97"]
sb.set_palette(flatui, n_colors=2, desat=0.8)
temp_df = bikeshare.query(' duration_min != 0 and duration_min <= 120 and distance_miles <= 3 ').copy()

# build the grid, then draw a very translucent scatter on each facet
facet_options = dict(col='pass_type', col_wrap=3, height=3, aspect=1, hue='fare_type')
g = sb.FacetGrid(data=temp_df, **facet_options)
g.map(
    sb.regplot,
    'distance_miles',
    'duration_min',
    fit_reg=False,
    scatter_kws=dict(alpha=1/100),
)

# figure title sits in the space freed at the top of the figure
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle(
    'Distribution of bike rentals over pass type and fare type\n',
    fontsize=14,
    weight='bold',
)
# per-facet titles and axis labels
(g.set_titles('{col_name}', weight='bold', size=12, color='dimgrey')
  .set_xlabels('\nDistance (miles)', size=12)
  .set_ylabels('Duration (minutes)\n', size=12))
plt.subplots_adjust(wspace=0.1, hspace=0.3);

# bbox_inches='tight' keeps all of the x and y labels inside the saved image
plt.savefig(
    'plots/3.3.15.a Facet grid of Trip distances and durations over pass type and fare type.png',
    dpi=300,
    bbox_inches='tight',
)
Plot a regression line for analysis purpose:
# Scatter plot with regression of duration against distance for the subset
# (non-zero durations up to 120 minutes, distances up to 3 miles),
# one facet per pass type, coloured by fare type.
flatui = ["#e74c9c", "#26bd97"]
sb.set_palette(flatui, n_colors=2, desat=0.8)
temp_df = bikeshare.query(' duration_min != 0 and duration_min <= 120 and distance_miles <= 3 ').copy()
# plot facet grid
# col_wrap = 3 matches the layout of the other pass_type facet grids in this notebook
g = sb.FacetGrid(data = temp_df, col = 'pass_type', col_wrap = 3, height = 3, aspect = 1, hue = 'fare_type')
g.map(sb.regplot, 'distance_miles', 'duration_min', fit_reg = True, scatter_kws = {'alpha' : 1/100})
# improve plot aesthetics
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Distribution of bike rentals over pass type and fare type\n', fontsize = 14, weight = 'bold')
g.set_titles('{col_name}', weight = 'bold', size = 12, color = 'dimgrey')
g.set_xlabels('\nDistance (miles)', size = 12)
g.set_ylabels('Duration (minutes)\n', size = 12)
# ticks every mile on x and every 20 minutes on y, covering the queried subset range
x_tick_locs = np.arange(0, 3+1, 1)
y_tick_locs = np.arange(0, 120+20, 20)
# Set ticks on every facet explicitly; pyplot's xticks/yticks only act on the current axes.
for ax in g.axes.flat:
    ax.set_xticks(x_tick_locs)
    ax.set_xticklabels(x_tick_locs, size = 10)
    ax.set_yticks(y_tick_locs)
    ax.set_yticklabels(y_tick_locs, size = 10)
plt.subplots_adjust(wspace=0.1, hspace=0.3)
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.3.15.b Facet grid of Trip distances and durations over pass type and fare type with regression.png', dpi=300, bbox_inches='tight')
duration_min, distance_miles, pass_type, bike_type and trip_type columns:¶ Columns: duration_min, distance_miles, pass_type, bike_type, trip_type. Data type: (Numerical, continuous), (Numerical, continuous), (categorical, nominal), (categorical, nominal), (categorical, nominal). Plot: Facet grid, Scatter plot. lmplot:
def log_trans(x, inverse = False):
    """Transform values to log10 space, or back to linear space.

    x       : scalar or array-like accepted by np.log10 / np.power
    inverse : when False return log10(x); when True return 10 ** x
    """
    if not inverse:
        return np.log10(x)
    else:
        return np.power(10, x)


# Scatter plot of log10(duration) against distance on a pass type (rows) by
# bike type (columns) grid, coloured by trip type.
sb.set_palette('deep', n_colors=2, desat=0.8)
current_palette = sb.color_palette()
# zero-minute trips are excluded because log10(0) is -inf
temp_df = bikeshare.query(' duration_min != 0 ').copy()
# transform the whole column in one vectorised call rather than element by element
temp_df['duration_log'] = log_trans(temp_df['duration_min'])
# plot facet grid
# legend=False because a customised legend is added explicitly below
g = sb.lmplot(x='distance_miles', y='duration_log', fit_reg=False, scatter_kws = {'alpha' : 1/10},
              hue='trip_type', col='bike_type', row='pass_type', data=temp_df, legend=False)
# improve plot aesthetics
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('Distribution of bike rentals over multiple categories\n', fontsize = 24, weight = 'bold')
g.set_titles('Bike = {col_name} | Pass = {row_name}', weight = 'bold', size = 18, color = 'dimgrey')
g.set_xlabels('\nDistance (miles)', size = 18)
g.set_ylabels('Duration (minutes)\n', size = 18)
x_tick_locs = [0, 5, 10, 15, 20, 25]
y_tick_locs = [1, 10, 100, 1000, 10000]
# Set ticks on every facet explicitly; pyplot's xticks/yticks only act on the current axes.
for ax in g.axes.flat:
    ax.set_xticks(x_tick_locs)
    ax.set_xticklabels(x_tick_locs, size = 16)
    # the data is plotted in log10 space, so ticks sit at log10(value) but show the value itself
    ax.set_yticks(log_trans(y_tick_locs))
    ax.set_yticklabels(y_tick_locs, size = 16)
# add legend
g.add_legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
             framealpha = 1, borderpad=1, borderaxespad=1, labelspacing=0.5,
             title='Trip type', title_fontsize=14, fontsize=12, facecolor='white',
             markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(0.55, 0.96))
# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles (old name removed in 3.9)
legend_handles = getattr(g._legend, 'legend_handles', None)
if legend_handles is None:
    legend_handles = g._legend.legendHandles
for lh in legend_handles:
    # legend markers are drawn opaque and larger than the translucent data points
    lh.set_alpha(1)
    lh.set_sizes([50])
plt.subplots_adjust(wspace=0.1, hspace=0.3)
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.3.16.a Facet grid of Trip distances and durations over pass type, bike type and trip type.png', dpi=300, bbox_inches='tight')
Add more transparency to identify the clusters of data.
def log_trans(x, inverse = False):
    """Transform values to log10 space, or back to linear space.

    x       : scalar or array-like accepted by np.log10 / np.power
    inverse : when False return log10(x); when True return 10 ** x
    """
    if not inverse:
        return np.log10(x)
    else:
        return np.power(10, x)


# Same pass type (rows) by bike type (columns) grid of log10(duration) against distance,
# coloured by trip type, drawn with point alpha 1/100 so dense clusters stand out.
sb.set_palette('deep', n_colors=2, desat=0.8)
current_palette = sb.color_palette()
# zero-minute trips are excluded because log10(0) is -inf
temp_df = bikeshare.query(' duration_min != 0 ').copy()
# transform the whole column in one vectorised call rather than element by element
temp_df['duration_log'] = log_trans(temp_df['duration_min'])
# plot facet grid
# legend=False because a customised legend is added explicitly below
g = sb.lmplot(x='distance_miles', y='duration_log', fit_reg=False, scatter_kws = {'alpha' : 1/100},
              hue='trip_type', col='bike_type', row='pass_type', data=temp_df, legend=False)
# improve plot aesthetics
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('Distribution of bike rentals over multiple categories\n', fontsize = 24, weight = 'bold')
g.set_titles('Bike = {col_name} | Pass = {row_name}', weight = 'bold', size = 18, color = 'dimgrey')
g.set_xlabels('\nDistance (miles)', size = 18)
g.set_ylabels('Duration (minutes)\n', size = 18)
x_tick_locs = [0, 5, 10, 15, 20, 25]
y_tick_locs = [1, 10, 100, 1000, 10000]
# Set ticks on every facet explicitly; pyplot's xticks/yticks only act on the current axes.
for ax in g.axes.flat:
    ax.set_xticks(x_tick_locs)
    ax.set_xticklabels(x_tick_locs, size = 16)
    # the data is plotted in log10 space, so ticks sit at log10(value) but show the value itself
    ax.set_yticks(log_trans(y_tick_locs))
    ax.set_yticklabels(y_tick_locs, size = 16)
# add legend
g.add_legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
             framealpha = 1, borderpad=1, borderaxespad=1, labelspacing=0.5,
             title='Trip type', title_fontsize=14, fontsize=12, facecolor='white',
             markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(0.55, 0.96))
# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles (old name removed in 3.9)
legend_handles = getattr(g._legend, 'legend_handles', None)
if legend_handles is None:
    legend_handles = g._legend.legendHandles
for lh in legend_handles:
    # legend markers are drawn opaque and larger than the translucent data points
    lh.set_alpha(1)
    lh.set_sizes([50])
plt.subplots_adjust(wspace=0.1, hspace=0.3)
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.3.16.b Facet grid of Trip distances and durations over pass type, bike type and trip type.png', dpi=300, bbox_inches='tight')
Limit the dataset to trip durations under 120 minutes and distances under 3 miles for closer observation.
# Scatter plot of duration against distance for the subset (durations up to 120 minutes,
# distances up to 3 miles) on a pass type (rows) by bike type (columns) grid,
# coloured by trip type.
sb.set_palette('deep', n_colors=2, desat=0.8)
current_palette = sb.color_palette()
temp_df = bikeshare.query(' duration_min <= 120 and distance_miles <= 3 ').copy()
# plot facet grid
# legend=False because a customised legend is added explicitly below
g = sb.lmplot(x='distance_miles', y='duration_min', fit_reg=False, scatter_kws = {'alpha' : 1/10},
              hue='trip_type', col='bike_type', row='pass_type', data=temp_df, legend=False)
# improve plot aesthetics
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('Distribution of bike rentals over multiple categories\n', fontsize = 24, weight = 'bold')
g.set_titles('Bike = {col_name} | Pass = {row_name}', weight = 'bold', size = 18, color = 'dimgrey')
g.set_xlabels('\nDistance (miles)', size = 18)
g.set_ylabels('Duration (minutes)\n', size = 18)
# ticks every mile on x and every 20 minutes on y, covering the queried subset range
x_tick_locs = np.arange(0, 3+1, 1)
y_tick_locs = np.arange(0, 120+20, 20)
# Set ticks on every facet explicitly; pyplot's xticks/yticks only act on the current axes.
for ax in g.axes.flat:
    ax.set_xticks(x_tick_locs)
    ax.set_xticklabels(x_tick_locs, size = 16)
    ax.set_yticks(y_tick_locs)
    ax.set_yticklabels(y_tick_locs, size = 16)
# add legend
g.add_legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
             framealpha = 1, borderpad=1, borderaxespad=1, labelspacing=0.5,
             title='Trip type', title_fontsize=14, fontsize=12, facecolor='white',
             markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(0.55, 0.96))
# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles (old name removed in 3.9)
legend_handles = getattr(g._legend, 'legend_handles', None)
if legend_handles is None:
    legend_handles = g._legend.legendHandles
for lh in legend_handles:
    # legend markers are drawn opaque and larger than the translucent data points
    lh.set_alpha(1)
    lh.set_sizes([50])
plt.subplots_adjust(wspace=0.1, hspace=0.3)
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.3.16.c Subset Facet grid of Trip distances and durations over pass type, bike type and trip type.png', dpi=300, bbox_inches='tight')
Add more transparency to identify the clusters of data.
# Same subset grid (durations up to 120 minutes, distances up to 3 miles; pass type rows,
# bike type columns, coloured by trip type), drawn with point alpha 1/100 so dense
# clusters stand out.
sb.set_palette('deep', n_colors=2, desat=0.8)
current_palette = sb.color_palette()
temp_df = bikeshare.query(' duration_min <= 120 and distance_miles <= 3 ').copy()
# plot facet grid
# legend=False because a customised legend is added explicitly below
g = sb.lmplot(x='distance_miles', y='duration_min', fit_reg=False, scatter_kws = {'alpha' : 1/100},
              hue='trip_type', col='bike_type', row='pass_type', data=temp_df, legend=False)
# improve plot aesthetics
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('Distribution of bike rentals over multiple categories\n', fontsize = 24, weight = 'bold')
g.set_titles('Bike = {col_name} | Pass = {row_name}', weight = 'bold', size = 18, color = 'dimgrey')
g.set_xlabels('\nDistance (miles)', size = 18)
g.set_ylabels('Duration (minutes)\n', size = 18)
# ticks every mile on x and every 20 minutes on y, covering the queried subset range
x_tick_locs = np.arange(0, 3+1, 1)
y_tick_locs = np.arange(0, 120+20, 20)
# Set ticks on every facet explicitly; pyplot's xticks/yticks only act on the current axes.
for ax in g.axes.flat:
    ax.set_xticks(x_tick_locs)
    ax.set_xticklabels(x_tick_locs, size = 16)
    ax.set_yticks(y_tick_locs)
    ax.set_yticklabels(y_tick_locs, size = 16)
# add legend
g.add_legend(scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol = 1,
             framealpha = 1, borderpad=1, borderaxespad=1, labelspacing=0.5,
             title='Trip type', title_fontsize=14, fontsize=12, facecolor='white',
             markerfirst=True, handlelength=2, handletextpad=0.5, bbox_to_anchor=(0.55, 0.96))
# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles (old name removed in 3.9)
legend_handles = getattr(g._legend, 'legend_handles', None)
if legend_handles is None:
    legend_handles = g._legend.legendHandles
for lh in legend_handles:
    # legend markers are drawn opaque and larger than the translucent data points
    lh.set_alpha(1)
    lh.set_sizes([50])
plt.subplots_adjust(wspace=0.1, hspace=0.3)
# savefig by passing (bbox_inches='tight'),which will adjust the figure to include all of the x and y labels
plt.savefig('plots/3.3.16.d Subset Facet grid of Trip distances and durations over pass type, bike type and trip type.png', dpi=300, bbox_inches='tight')
hour and year columns:¶Columns: hour, yearData type: (Numerical, continuous) and (Categorical, nominal)Plot: Point plotFind average rentals based on the hour of the day over years:
Create a dataset which contains the bike rentals for each hour of the day over the respective months of the year. Care should be taken to include all hours in every day of the month over the individual years. Use all categorical combinations and fill the NULL values with zeros, so that every hour of every day is counted, including hours with no rentals.
# create a dataset for bike rentals over each hour in a day
# Count trips per (year, month, day, hour).
# observed=False is passed explicitly: when the grouping columns are categorical it keeps
# every category combination (with a count of 0 where there were no rentals) rather than
# relying on the pandas default, which differs between versions.
# NOTE(review): this only yields the empty combinations if year/month/day/hour are
# categorical columns - their dtypes are not visible here, confirm upstream.
hours_df = (bikeshare
            .groupby(['year', 'month', 'day', 'hour'], observed=False)['trip_id']
            .count()  # count only trip_id instead of counting every column and discarding the rest
            .reset_index(name='rentals'))
# defensive: make sure the counts are integers with no missing values
hours_df['rentals'] = hours_df['rentals'].fillna(0).astype(int)
hours_df.head(10)
Point plot:
# Point plot of the mean number of rentals for each hour of the day, one line per year.
# Figure size, style and palette are set before drawing.
plt.figure(figsize=[12,4])
sb.set_style('white')
flatui = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_palette(flatui, n_colors=3, desat=0.8)

# hours in ascending order along the x axis
plot_order = hours_df['hour'].sort_values().unique()
sb.pointplot(
    data=hours_df,
    x="hour",
    y="rentals",
    hue='year',
    order=plot_order,
    linestyles="-",
    ci=None,
)

# title, axis labels and x tick size
plt.title('Average bike rentals based on hour of the day\n', weight = 'bold', fontsize = 16)
plt.xlabel('\nHour of the day', fontsize = 14)
plt.ylabel('Avg. rentals\n', fontsize = 14)
plt.xticks(fontsize = 12)

# rebuild the y ticks in steps of 10 starting from zero, up to the current top tick
locs, labels = plt.yticks()
max_count = locs.max()
y_tick_values = np.arange(0, max_count+10, 10)
y_tick_names = [f'{value:0.0f}' for value in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize = 12)

# legend placed outside the axes, to the right
legend_options = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, ncol=1, framealpha=1,
    borderpad=1, borderaxespad=1, bbox_to_anchor=(1, 0.8), loc=6, labelspacing=0.5,
    title='Year', title_fontsize=12, fontsize=10, facecolor='white', markerfirst=True,
    handlelength=2, handletextpad=0.5,
)
plt.legend(**legend_options)
sb.despine(top=True, right=True, left=False, bottom=False);

# bbox_inches='tight' keeps all of the x and y labels inside the saved image
plt.savefig(
    'plots/3.3.17 Average bike rentals based on hour of the day over years.png',
    dpi=300,
    bbox_inches='tight',
)
daytime and year columns:¶
Columns: daytime, year
Data type: (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the daytime of the day over years:
Create a dataset which contains bike rentals relative to each daytime in the day over respective months in the year. Care should be taken to include all daytimes in every day of the month over individual years. Use all categorical combinations and fill the NULL values with numerical zeros so as to consider bike rentals subjected to every daytime in any day.
# Tally trips for every (year, month, day, daytime) group; the per-group count
# of trip_id becomes the 'rentals' column.
daytime_df = (
    bikeshare
    .groupby([bikeshare[key] for key in ('year', 'month', 'day', 'daytime')])
    .count()['trip_id']
    # any missing count is treated as zero rentals and stored as an integer
    .fillna(0)
    .astype(int)
    .reset_index(name='rentals')
)
# preview the first ten rows
daytime_df.head(10)
Point plot:
# Canvas, seaborn theme and the three-colour palette used for the hue levels
plt.figure(figsize=(6, 4))
sb.set_style('white')
flatui = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_palette(flatui, n_colors=3, desat=0.8)

# Point plot of rentals against daytime, one series per year, no error bars;
# the x categories are the distinct daytimes in ascending order
plot_order = daytime_df['daytime'].sort_values().unique()
sb.pointplot(x='daytime', y='rentals', hue='year', data=daytime_df,
             order=plot_order, linestyles='-', ci=None)

# Title and axis captions
plt.title('Average bike rentals based on time of the day\n', weight='bold', fontsize=16)
plt.xlabel('\nTime of the day', fontsize=14)
plt.ylabel('Avg. bike rentals\n', fontsize=14)
plt.xticks(fontsize=12)

# Rebuild the y ticks from zero up to the largest automatic tick, in steps of 50
locs, labels = plt.yticks()
max_count = np.max(locs)
y_tick_values = np.arange(0, max_count + 50, 50)
y_tick_names = [format(v, '0.0f') for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

# Legend anchored to the right of the axes
plt.legend(
    title='Year', title_fontsize=12, fontsize=10,
    loc='center left', bbox_to_anchor=(1, 0.8), ncol=1,
    frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
    borderpad=1, borderaxespad=1, labelspacing=0.5,
    scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
)
sb.despine(top=True, right=True, left=False, bottom=False)

# bbox_inches='tight' grows the saved area so that all x and y labels are included
plt.savefig('plots/3.3.18 Average bike rentals based on daytime of the day over years.png',
            dpi=300, bbox_inches='tight')
weekday and year columns:¶
Columns: weekday, year
Data type: (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the weekday of each week in a month over years:
Create a dataset which contains bike rentals relative to the weekday of each week in the month over respective years. Care should be taken to include all weekdays in every week of the month over individual years. Use all categorical combinations and fill the NULL values with numerical zeros so as to consider bike rentals subjected to every weekday in any week.
# Tally trips for every (year, month, week, weekday) group; the per-group count
# of trip_id becomes the 'rentals' column.
weekday_df = (
    bikeshare
    .groupby([bikeshare[key] for key in ('year', 'month', 'week', 'weekday')])
    .count()['trip_id']
    # any missing count is treated as zero rentals and stored as an integer
    .fillna(0)
    .astype(int)
    .reset_index(name='rentals')
)
# preview the first ten rows
weekday_df.head(10)
Point plot:
# Canvas, seaborn theme and the three-colour palette used for the hue levels
plt.figure(figsize=(8, 5))
sb.set_style('white')
flatui = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_palette(flatui, n_colors=3, desat=0.8)

# Point plot of rentals against weekday, one series per year, no error bars;
# the x categories are the distinct weekdays in ascending order
plot_order = weekday_df['weekday'].sort_values().unique()
sb.pointplot(x='weekday', y='rentals', hue='year', data=weekday_df,
             order=plot_order, linestyles='-', ci=None)

# Title and axis captions
plt.title('Average bike rentals based on weekday of the week\n', weight='bold', fontsize=16)
plt.xlabel('\nDay of the week', fontsize=14)
plt.ylabel('Avg. bike rentals\n', fontsize=14)
plt.xticks(fontsize=12)

# Rebuild the y ticks from zero up to the largest automatic tick, in steps of 100
locs, labels = plt.yticks()
max_count = np.max(locs)
y_tick_values = np.arange(0, max_count + 100, 100)
y_tick_names = [format(v, '0.0f') for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

# Legend anchored to the right of the axes
plt.legend(
    title='Year', title_fontsize=12, fontsize=10,
    loc='center left', bbox_to_anchor=(1, 0.8), ncol=1,
    frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
    borderpad=1, borderaxespad=1, labelspacing=0.5,
    scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
)
sb.despine(top=True, right=True, left=False, bottom=False)

# bbox_inches='tight' grows the saved area so that all x and y labels are included
plt.savefig('plots/3.3.19 Average bike rentals based on day of the week over years.png',
            dpi=300, bbox_inches='tight')
day and year columns:¶
Columns: day, year
Data type: (Categorical, ordered) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the day of the month over years:
Create a dataset which contains bike rentals relative to the day of each month over respective years. Care should be taken to include only the days related to each month over individual years. Use only the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to the respective day in any month.
# Tally trips for every (year, month, day) group; the per-group count of
# trip_id becomes the 'rentals' column.
days_df = (
    bikeshare
    .groupby([bikeshare[key] for key in ('year', 'month', 'day')])
    .count()['trip_id']
    .reset_index(name='rentals')
)
# preview the first ten rows
days_df.head(10)
Point plot:
# Canvas, seaborn theme and the three-colour palette used for the hue levels
plt.figure(figsize=(14, 5))
sb.set_style('white')
flatui = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_palette(flatui, n_colors=3, desat=0.8)

# Point plot of rentals against day of month, one series per year, no error
# bars; the x categories are the distinct days in ascending order
plot_order = days_df['day'].sort_values().unique()
sb.pointplot(x='day', y='rentals', hue='year', data=days_df,
             order=plot_order, linestyles='-', ci=None)

# Title and axis captions
plt.title('Average bike rentals based on day of the month\n', weight='bold', fontsize=16)
plt.xlabel('\nDay of the month', fontsize=14)
plt.ylabel('Avg. bike rentals\n', fontsize=14)
plt.xticks(fontsize=12)

# Rebuild the y ticks from zero up to the largest automatic tick, in steps of 100
locs, labels = plt.yticks()
max_count = np.max(locs)
y_tick_values = np.arange(0, max_count + 100, 100)
y_tick_names = [format(v, '0.0f') for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

# Legend anchored to the right of the axes
plt.legend(
    title='Year', title_fontsize=12, fontsize=10,
    loc='center left', bbox_to_anchor=(1, 0.8), ncol=1,
    frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
    borderpad=1, borderaxespad=1, labelspacing=0.5,
    scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
)
sb.despine(top=True, right=True, left=False, bottom=False)

# bbox_inches='tight' grows the saved area so that all x and y labels are included
plt.savefig('plots/3.3.20 Average bike rentals based on day of the month over years.png',
            dpi=300, bbox_inches='tight')
month and year columns:¶
Columns: month, year
Data type: (Categorical, ordered) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the month over years:
Create a dataset which contains bike rentals relative to each month over respective years. Care should be taken to include all the rentals related to each month over individual years. Use all categorical combinations and fill the NULL values with numerical zeros so as to consider bike rentals subjected to the respective month in any year.
# Tally trips for every (year, month) group; the per-group count of trip_id
# becomes the 'rentals' column.
month_df = (
    bikeshare
    .groupby([bikeshare[key] for key in ('year', 'month')])
    .count()['trip_id']
    # any missing count is treated as zero rentals and stored as an integer
    .fillna(0)
    .astype(int)
    .reset_index(name='rentals')
)
# preview the first ten rows
month_df.head(10)
Point plot:
# Canvas, seaborn theme and the three-colour palette used for the hue levels
plt.figure(figsize=(8, 5))
sb.set_style('white')
flatui = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_palette(flatui, n_colors=3, desat=0.8)

# Point plot of rentals against month, one series per year, no error bars;
# the x categories are the distinct months in ascending order
plot_order = month_df['month'].sort_values().unique()
sb.pointplot(x='month', y='rentals', hue='year', data=month_df,
             order=plot_order, linestyles='-', ci=None)

# Title and axis captions
plt.title('Average bike rentals based on month of the year\n', weight='bold', fontsize=16)
plt.xlabel('\nMonth of the year', fontsize=14)
plt.ylabel('Avg. bike rentals (thousands)\n', fontsize=14)
plt.xticks(fontsize=12)

# Rebuild the y ticks from zero up to the largest automatic tick, in steps of
# 5000, and label each tick in thousands with a ' K' suffix
locs, labels = plt.yticks()
max_count = np.max(locs)
y_tick_values = np.arange(0, max_count + 5000, 5000)
y_tick_names = [format(v / 1000, '0.0f') + ' K' for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

# Legend anchored to the right of the axes
plt.legend(
    title='Year', title_fontsize=12, fontsize=10,
    loc='center left', bbox_to_anchor=(1, 0.8), ncol=1,
    frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
    borderpad=1, borderaxespad=1, labelspacing=0.5,
    scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
)
sb.despine(top=True, right=True, left=False, bottom=False)

# bbox_inches='tight' grows the saved area so that all x and y labels are included
plt.savefig('plots/3.3.21 Average bike rentals based on the month over years.png',
            dpi=300, bbox_inches='tight')
quarter and year columns:¶
Columns: quarter, year
Data type: (Categorical, ordered) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the quarter of the year:
Create a dataset which contains bike rentals relative to each quarter over respective years. Care should be taken to include all the rentals related to each quarter over individual years. Use all categorical combinations and fill the NULL values with numerical zeros so as to consider bike rentals subjected to the respective quarter in any year.
# Tally trips for every (year, quarter) group; the per-group count of trip_id
# becomes the 'rentals' column.
quarter_df = (
    bikeshare
    .groupby([bikeshare[key] for key in ('year', 'quarter')])
    .count()['trip_id']
    # any missing count is treated as zero rentals and stored as an integer
    .fillna(0)
    .astype(int)
    .reset_index(name='rentals')
)
# preview the first ten rows
quarter_df.head(10)
Point plot:
# Canvas, seaborn theme and the three-colour palette used for the hue levels
plt.figure(figsize=(6, 5))
sb.set_style('white')
flatui = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_palette(flatui, n_colors=3, desat=0.8)

# Point plot of rentals against quarter, one series per year, no error bars;
# the x categories are the distinct quarters in ascending order
plot_order = quarter_df['quarter'].sort_values().unique()
sb.pointplot(x='quarter', y='rentals', hue='year', data=quarter_df,
             order=plot_order, linestyles='-', ci=None)

# Title and axis captions
plt.title('Average bike rentals based on quarter of the year\n', weight='bold', fontsize=16)
plt.xlabel('\nQuarter of the year', fontsize=14)
plt.ylabel('Avg. bike rentals (thousands)\n', fontsize=14)
plt.xticks(fontsize=12)

# Rebuild the y ticks from zero up to the largest automatic tick, in steps of
# 10000, and label each tick in thousands with a ' K' suffix
locs, labels = plt.yticks()
max_count = np.max(locs)
y_tick_values = np.arange(0, max_count + 10000, 10000)
y_tick_names = [format(v / 1000, '0.0f') + ' K' for v in y_tick_values]
plt.yticks(y_tick_values, y_tick_names, fontsize=12)

# Legend anchored to the right of the axes
plt.legend(
    title='Year', title_fontsize=12, fontsize=10,
    loc='center left', bbox_to_anchor=(1, 0.8), ncol=1,
    frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
    borderpad=1, borderaxespad=1, labelspacing=0.5,
    scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
)
sb.despine(top=True, right=True, left=False, bottom=False)

# bbox_inches='tight' grows the saved area so that all x and y labels are included
plt.savefig('plots/3.3.22 Average bike rentals based on the quarter over years.png',
            dpi=300, bbox_inches='tight')
hour, year and trip_type columns:¶
Columns: hour, year, trip_type
Data type: (Numerical, continuous), (Categorical, ordered) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the hour of the day over years:
Create a dataset which contains bike rentals relative to each hour in the day over respective months in the year. Care should be taken to include only hours that appear in every day of the month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every hour in any day.
# Tally trips for every (year, month, day, hour, trip_type) group; the
# per-group count of trip_id becomes the 'rentals' column.
hours_df = (
    bikeshare
    .groupby([bikeshare[key] for key in ('year', 'month', 'day', 'hour', 'trip_type')])
    .count()['trip_id']
    .reset_index(name='rentals')
)
# preview the first ten rows
hours_df.head(10)
Point plot:
# Seaborn theme and the three-colour palette used for the hue levels
sb.set_style('white')
flatui = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_palette(flatui, n_colors=3, desat=0.8)

# One facet per trip_type (two columns), coloured by year; every facet holds
# a point plot of rentals against hour with no error bars
plot_order = hours_df['hour'].sort_values().unique()
g = sb.FacetGrid(hours_df, col='trip_type', hue='year',
                 col_wrap=2, height=4, aspect=1.5)
g.map(sb.pointplot, 'hour', 'rentals', order=plot_order, linestyles='-', ci=None)

# Leave head-room above the facets for the figure-level title
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Average bike rentals based on hour of the day over years by trip type',
               fontsize=16, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Tick labels, axis captions and spacing between facets
g.set_xticklabels(plot_order, size=12)
g.set_yticklabels(size=12)
g.set_xlabels('\nHour of the day', size=14)
g.set_ylabels('Avg. Bike rentals\n', size=14)
plt.subplots_adjust(wspace=0.1, hspace=0.3)

# Hand-built legend: one line-with-marker handle per palette colour
custom = [Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
          for shade in sb.color_palette()[:3]]
plt.legend(
    custom, ['2017', '2018', '2019'],
    title='Year', title_fontsize=12, fontsize=10,
    bbox_to_anchor=(1, 1.1), ncol=1,
    frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
    borderpad=1, borderaxespad=1, labelspacing=0.5,
    scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
)

# bbox_inches='tight' grows the saved area so that all x and y labels are included
plt.savefig('plots/3.3.23 Average bike rentals based on hour of the day over years by trip type.png',
            dpi=300, bbox_inches='tight')
daytime, year and trip_type columns:¶
Columns: daytime, year, trip_type
Data type: (Categorical, ordered), (Categorical, ordered) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the daytime of the day over years:
Create a dataset which contains bike rentals relative to each daytime in the day over respective months in the year. Care should be taken to include only daytimes that appear in every day of the month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every daytime in any day.
# Tally trips for every (year, month, day, daytime, trip_type) group; the
# per-group count of trip_id becomes the 'rentals' column.
daytime_df = (
    bikeshare
    .groupby([bikeshare[key] for key in ('year', 'month', 'day', 'daytime', 'trip_type')])
    .count()['trip_id']
    .reset_index(name='rentals')
)
# preview the first ten rows
daytime_df.head(10)
Point plot:
# Seaborn theme and the three-colour palette used for the hue levels
sb.set_style('white')
flatui = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_palette(flatui, n_colors=3, desat=0.8)

# One facet per trip_type (two columns), coloured by year; every facet holds
# a point plot of rentals against daytime with no error bars
plot_order = daytime_df['daytime'].sort_values().unique()
g = sb.FacetGrid(daytime_df, col='trip_type', hue='year',
                 col_wrap=2, height=4, aspect=1.2)
g.map(sb.pointplot, 'daytime', 'rentals', order=plot_order, linestyles='-', ci=None)

# Leave head-room above the facets for the figure-level title
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Average bike rentals based on time of the day over years by trip type',
               fontsize=16, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Tick labels, axis captions and spacing between facets
g.set_xticklabels(plot_order, size=12)
g.set_yticklabels(size=12)
g.set_xlabels('\nTime of the day', size=14)
g.set_ylabels('Avg. Bike rentals\n', size=14)
plt.subplots_adjust(wspace=0.1, hspace=0.3)

# Hand-built legend: one line-with-marker handle per palette colour
custom = [Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
          for shade in sb.color_palette()[:3]]
plt.legend(
    custom, ['2017', '2018', '2019'],
    title='Year', title_fontsize=12, fontsize=10,
    bbox_to_anchor=(1, 1.1), ncol=1,
    frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
    borderpad=1, borderaxespad=1, labelspacing=0.5,
    scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
)

# bbox_inches='tight' grows the saved area so that all x and y labels are included
plt.savefig('plots/3.3.24 Average bike rentals based on time of the day over years by trip type.png',
            dpi=300, bbox_inches='tight')
day, year and trip_type columns:¶
Columns: day, year, trip_type
Data type: (Categorical, ordered), (Categorical, ordered) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the day of the month over years:
Create a dataset which contains bike rentals relative to each day over respective months in the year. Care should be taken to include only days that appear in every month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every day in any month.
# Tally trips for every (year, month, day, trip_type) group; the per-group
# count of trip_id becomes the 'rentals' column.
days_df = (
    bikeshare
    .groupby([bikeshare[key] for key in ('year', 'month', 'day', 'trip_type')])
    .count()['trip_id']
    .reset_index(name='rentals')
)
# preview the first ten rows
days_df.head(10)
Point plot:
# Seaborn theme and the three-colour palette used for the hue levels
sb.set_style('white')
flatui = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_palette(flatui, n_colors=3, desat=0.8)

# One facet per trip_type (two columns), coloured by year; every facet holds
# a point plot of rentals against day of month with no error bars
plot_order = days_df['day'].sort_values().unique()
g = sb.FacetGrid(days_df, col='trip_type', hue='year',
                 col_wrap=2, height=4.5, aspect=1.8)
g.map(sb.pointplot, 'day', 'rentals', order=plot_order, linestyles='-', ci=None)

# Leave head-room above the facets for the figure-level title
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Average bike rentals based on day of the month over years by trip type',
               fontsize=16, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Tick labels, axis captions and spacing between facets
g.set_xticklabels(plot_order, size=12)
g.set_yticklabels(size=12)
g.set_xlabels('\nDay of the month', size=14)
g.set_ylabels('Avg. Bike rentals\n', size=14)
plt.subplots_adjust(wspace=0.05, hspace=0.3)

# Hand-built legend: one line-with-marker handle per palette colour
custom = [Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
          for shade in sb.color_palette()[:3]]
plt.legend(
    custom, ['2017', '2018', '2019'],
    title='Year', title_fontsize=12, fontsize=10,
    bbox_to_anchor=(1, 1.1), ncol=1,
    frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
    borderpad=1, borderaxespad=1, labelspacing=0.5,
    scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
)

# bbox_inches='tight' grows the saved area so that all x and y labels are included
plt.savefig('plots/3.3.25 Average bike rentals based on day of the month over years by trip type.png',
            dpi=300, bbox_inches='tight')
weekday, year and trip_type columns:¶
Columns: weekday, year, trip_type
Data type: (Categorical, ordered), (Categorical, ordered) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the day of the week over years:
Create a dataset which contains bike rentals relative to each day over the respective week in the month. Care should be taken to include only days that appear in every week over individual months. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every day in any week.
# Tally trips for every (year, month, week, weekday, trip_type) group; the
# per-group count of trip_id becomes the 'rentals' column.
weekday_df = (
    bikeshare
    .groupby([bikeshare[key] for key in ('year', 'month', 'week', 'weekday', 'trip_type')])
    .count()['trip_id']
    .reset_index(name='rentals')
)
# preview the first ten rows
weekday_df.head(10)
Point plot:
# Seaborn theme and the three-colour palette used for the hue levels
sb.set_style('white')
flatui = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_palette(flatui, n_colors=3, desat=0.8)

# One facet per trip_type (two columns), coloured by year; every facet holds
# a point plot of rentals against weekday with no error bars
plot_order = weekday_df['weekday'].sort_values().unique()
g = sb.FacetGrid(weekday_df, col='trip_type', hue='year',
                 col_wrap=2, height=5, aspect=1.3)
g.map(sb.pointplot, 'weekday', 'rentals', order=plot_order, linestyles='-', ci=None)

# Leave head-room above the facets for the figure-level title
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Average bike rentals based on day of the week over years by trip type',
               fontsize=16, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Tick labels, axis captions and spacing between facets
g.set_xticklabels(plot_order, size=12)
g.set_yticklabels(size=12)
g.set_xlabels('\nDay of the week', size=14)
g.set_ylabels('Avg. Bike rentals\n', size=14)
plt.subplots_adjust(wspace=0.05, hspace=0.3)

# Hand-built legend: one line-with-marker handle per palette colour
custom = [Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
          for shade in sb.color_palette()[:3]]
plt.legend(
    custom, ['2017', '2018', '2019'],
    title='Year', title_fontsize=12, fontsize=10,
    bbox_to_anchor=(1, 1.1), ncol=1,
    frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
    borderpad=1, borderaxespad=1, labelspacing=0.5,
    scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
)

# bbox_inches='tight' grows the saved area so that all x and y labels are included
plt.savefig('plots/3.3.26 Average bike rentals based on day of the week over years by trip type.png',
            dpi=300, bbox_inches='tight')
month, year and trip_type columns:¶
Columns: month, year, trip_type
Data type: (Categorical, ordered), (Categorical, ordered) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the month over years:
Create a dataset which contains bike rentals relative to each month over respective years. Care should be taken to include only rentals that appear in every month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every month in any year.
# Tally trips for every (year, month, trip_type) group; the per-group count
# of trip_id becomes the 'rentals' column.
month_df = (
    bikeshare
    .groupby([bikeshare[key] for key in ('year', 'month', 'trip_type')])
    .count()['trip_id']
    .reset_index(name='rentals')
)
# preview the first ten rows
month_df.head(10)
Point plot:
# Seaborn theme and the three-colour palette used for the hue levels
sb.set_style('white')
flatui = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_palette(flatui, n_colors=3, desat=0.8)

# One facet per trip_type (two columns), coloured by year; every facet holds
# a point plot of rentals against month with no error bars
plot_order = month_df['month'].sort_values().unique()
g = sb.FacetGrid(month_df, col='trip_type', hue='year',
                 col_wrap=2, height=4, aspect=1.2)
g.map(sb.pointplot, 'month', 'rentals', order=plot_order, linestyles='-', ci=None)

# Leave head-room above the facets for the figure-level title
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Average bike rentals based on month over the years by trip type',
               fontsize=16, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Tick labels, axis captions and spacing between facets
g.set_xticklabels(plot_order, size=12)
g.set_yticklabels(size=12)
g.set_xlabels('\nMonth of the year', size=14)
g.set_ylabels('Avg. Bike rentals\n', size=14)
plt.subplots_adjust(wspace=0.05, hspace=0.3)

# Hand-built legend: one line-with-marker handle per palette colour
custom = [Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
          for shade in sb.color_palette()[:3]]
plt.legend(
    custom, ['2017', '2018', '2019'],
    title='Year', title_fontsize=12, fontsize=10,
    bbox_to_anchor=(1, 1.1), ncol=1,
    frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
    borderpad=1, borderaxespad=1, labelspacing=0.5,
    scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
)

# bbox_inches='tight' grows the saved area so that all x and y labels are included
plt.savefig('plots/3.3.27 Average bike rentals based on month over the years by trip type.png',
            dpi=300, bbox_inches='tight')
quarter, year and trip_type columns:¶
Columns: quarter, year, trip_type
Data type: (Categorical, ordered), (Categorical, ordered) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the quarter over years:
Create a dataset which contains bike rentals relative to each quarter over respective years. Care should be taken to include only rentals that appear in every quarter over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every quarter in any year.
# Tally trips for every (year, quarter, trip_type) group; the per-group count
# of trip_id becomes the 'rentals' column.
quarter_df = (
    bikeshare
    .groupby([bikeshare[key] for key in ('year', 'quarter', 'trip_type')])
    .count()['trip_id']
    .reset_index(name='rentals')
)
# preview the first ten rows
quarter_df.head(10)
Point plot:
# Seaborn theme and the three-colour palette used for the hue levels
sb.set_style('white')
flatui = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_palette(flatui, n_colors=3, desat=0.8)

# One facet per trip_type (two columns), coloured by year; every facet holds
# a point plot of rentals against quarter with no error bars
plot_order = quarter_df['quarter'].sort_values().unique()
g = sb.FacetGrid(quarter_df, col='trip_type', hue='year',
                 col_wrap=2, height=4, aspect=1)
g.map(sb.pointplot, 'quarter', 'rentals', order=plot_order, linestyles='-', ci=None)

# Leave head-room above the facets for the figure-level title
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Average bike rentals based on quarter over the years by trip type',
               fontsize=16, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Tick labels, axis captions and spacing between facets
g.set_xticklabels(plot_order, size=12)
g.set_yticklabels(size=12)
g.set_xlabels('\nQuarter of the year', size=14)
g.set_ylabels('Avg. Bike rentals\n', size=14)
plt.subplots_adjust(wspace=0.05, hspace=0.3)

# Hand-built legend: one line-with-marker handle per palette colour
custom = [Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
          for shade in sb.color_palette()[:3]]
plt.legend(
    custom, ['2017', '2018', '2019'],
    title='Year', title_fontsize=12, fontsize=10,
    bbox_to_anchor=(1, 1.1), ncol=1,
    frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
    borderpad=1, borderaxespad=1, labelspacing=0.5,
    scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
)

# bbox_inches='tight' grows the saved area so that all x and y labels are included
plt.savefig('plots/3.3.28 Average bike rentals based on quarter over the years by trip type.png',
            dpi=300, bbox_inches='tight')
hour, year and bike_type columns:¶
Columns: hour, year, bike_type
Data type: (Numerical, continuous), (Categorical, ordered) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the hour of the day over years:
Create a dataset which contains bike rentals relative to each hour in the day over respective months in the year. Care should be taken to include only hours that appear in every day of the month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every hour in any day.
# Tally trips for every (year, month, day, hour, bike_type) group; the
# per-group count of trip_id becomes the 'rentals' column.
hours_df = (
    bikeshare
    .groupby([bikeshare[key] for key in ('year', 'month', 'day', 'hour', 'bike_type')])
    .count()['trip_id']
    .reset_index(name='rentals')
)
# preview the first ten rows
hours_df.head(10)
Point plot:
# Seaborn theme and the three-colour palette used for the hue levels
sb.set_style('white')
flatui = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_palette(flatui, n_colors=3, desat=0.8)

# One facet per bike_type (two columns), coloured by year; every facet holds
# a point plot of rentals against hour with no error bars
plot_order = hours_df['hour'].sort_values().unique()
g = sb.FacetGrid(hours_df, col='bike_type', hue='year',
                 col_wrap=2, height=4, aspect=1.5)
g.map(sb.pointplot, 'hour', 'rentals', order=plot_order, linestyles='-', ci=None)

# Leave head-room above the facets for the figure-level title
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Average bike rentals based on hour of the day over years by bike type',
               fontsize=16, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Tick labels, axis captions and spacing between facets
g.set_xticklabels(plot_order, size=12)
g.set_yticklabels(size=12)
g.set_xlabels('\nHour of the day', size=14)
g.set_ylabels('Avg. Bike rentals\n', size=14)
plt.subplots_adjust(wspace=0.05, hspace=0.2)

# Hand-built legend: one line-with-marker handle per palette colour
custom = [Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
          for shade in sb.color_palette()[:3]]
plt.legend(
    custom, ['2017', '2018', '2019'],
    title='Year', title_fontsize=12, fontsize=10,
    bbox_to_anchor=(1, 1.1), ncol=1,
    frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
    borderpad=1, borderaxespad=1, labelspacing=0.5,
    scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
)

# bbox_inches='tight' grows the saved area so that all x and y labels are included
plt.savefig('plots/3.3.29 Average bike rentals based on hour of the day over years by bike type.png',
            dpi=300, bbox_inches='tight')
daytime, year and bike_type columns:¶
Columns: daytime, year, bike_type
Data type: (Categorical, ordered), (Categorical, ordered) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the daytime of the day over years:
Create a dataset which contains bike rentals relative to each daytime in the day over respective months in the year. Care should be taken to include only daytimes that appear in every day of the month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every daytime in any day.
# Tally trips for every (year, month, day, daytime, bike_type) group; the
# per-group count of trip_id becomes the 'rentals' column.
daytime_df = (
    bikeshare
    .groupby([bikeshare[key] for key in ('year', 'month', 'day', 'daytime', 'bike_type')])
    .count()['trip_id']
    .reset_index(name='rentals')
)
# preview the first ten rows
daytime_df.head(10)
Point plot:
# Seaborn theme and the three-colour palette used for the hue levels
sb.set_style('white')
flatui = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_palette(flatui, n_colors=3, desat=0.8)

# One facet per bike_type (two columns), coloured by year; every facet holds
# a point plot of rentals against daytime with no error bars
plot_order = daytime_df['daytime'].sort_values().unique()
g = sb.FacetGrid(daytime_df, col='bike_type', hue='year',
                 col_wrap=2, height=4, aspect=1.2)
g.map(sb.pointplot, 'daytime', 'rentals', order=plot_order, linestyles='-', ci=None)

# Leave head-room above the facets for the figure-level title
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Average bike rentals based on time of the day over years by bike type',
               fontsize=16, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Tick labels, axis captions and spacing between facets
g.set_xticklabels(plot_order, size=12)
g.set_yticklabels(size=12)
g.set_xlabels('\nTime of the day', size=14)
g.set_ylabels('Avg. Bike rentals\n', size=14)
plt.subplots_adjust(wspace=0.05, hspace=0.2)

# Hand-built legend: one line-with-marker handle per palette colour
custom = [Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
          for shade in sb.color_palette()[:3]]
plt.legend(
    custom, ['2017', '2018', '2019'],
    title='Year', title_fontsize=12, fontsize=10,
    bbox_to_anchor=(1, 1.1), ncol=1,
    frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
    borderpad=1, borderaxespad=1, labelspacing=0.5,
    scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
)

# bbox_inches='tight' grows the saved area so that all x and y labels are included
plt.savefig('plots/3.3.30 Average bike rentals based on time of the day over years by bike type.png',
            dpi=300, bbox_inches='tight')
day, year and bike_type columns:¶Columns: day, year, bike_typeData type: (Categorical, ordered), (Categorical, ordered) and (Categorical, nominal)Plot: Point plotFind average rentals based on the day of the month over years:
Create a dataset which contains bike rentals relative to each day over respective months in the year. Care should be taken to include only days that appear in every month over individual years. Use available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every day in any month.
# Build one row per (year, month, day, bike_type) with the number of rentals
# (non-null trip ids) on that calendar day.
# observed=True keeps only key combinations that actually occur in the data.
# Without it, categorical keys are expanded to their full cartesian product
# (e.g. a 31st day for every month) and the never-observed combinations show up
# as empty groups (NaN or 0 depending on the pandas version); zeros would drag
# down the averages computed from this frame.
group_keys = ['year', 'month', 'day', 'bike_type']
days_df = (bikeshare.groupby(group_keys, observed=True)['trip_id']
           .count()
           .reset_index(name='rentals'))
days_df.head(10)
Point plot:
# Seaborn look: white background and a three-colour palette (one colour per year)
sb.set_style('white')
year_palette = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_palette(year_palette, n_colors=3, desat=0.8)

# One facet per bike type, one coloured line per year; ci=None turns off error bars
x_order = days_df['day'].sort_values().unique()
grid = sb.FacetGrid(data=days_df, col='bike_type', col_wrap=2,
                    height=4.5, aspect=1.8, hue='year')
grid.map(sb.pointplot, 'day', 'rentals', order=x_order, linestyles='-', ci=None)

# Leave head-room for the figure title, then title each facet with its category
grid.fig.subplots_adjust(top=0.85)
grid.fig.suptitle('Average bike rentals based on day of the month over years by bike type',
                  fontsize=16, weight='bold')
grid.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Tick labels, axis labels and spacing between facets
grid.set_xticklabels(x_order, size=12)
grid.set_yticklabels(size=12)
grid.set_xlabels('\nDay of the month', size=14)
grid.set_ylabels('Avg. Bike rentals\n', size=14)
plt.subplots_adjust(wspace=0.05, hspace=0.2)

# Hand-built legend: one line-with-marker handle per palette colour, labelled by year
active_palette = sb.color_palette()
legend_handles = [Line2D([], [], marker='.', color=active_palette[idx],
                         linestyle='-', linewidth=2)
                  for idx in range(3)]
legend_options = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False,
                      framealpha=1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                      ncol=1, title='Year', title_fontsize=12, fontsize=10,
                      facecolor='white', markerfirst=True, handlelength=2,
                      handletextpad=0.5, bbox_to_anchor=(1, 1.1))
plt.legend(legend_handles, ['2017', '2018', '2019'], **legend_options)

# bbox_inches='tight' adjusts the saved figure so all x and y labels are included
plt.savefig('plots/3.3.31 Average bike rentals based on day of the month over years by bike type.png',
            dpi=300, bbox_inches='tight')
weekday, year and bike_type columns:¶Columns: weekday, year, bike_typeData type: (Categorical, ordered), (Categorical, ordered) and (Categorical, nominal)Plot: Point plotFind average rentals based on the day of the week over years:
Create a dataset which contains bike rentals relative to each day over respective week in the month. Care should be taken to include only days that appear in every week over individual months. Use available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every day in any week.
# Build one row per (year, month, week, weekday, bike_type) with the number of
# rentals (non-null trip ids) in that slot.
# observed=True keeps only key combinations that actually occur in the data.
# Without it, categorical keys are expanded to their full cartesian product
# (most week/month pairs never coexist) and the never-observed combinations show
# up as empty groups (NaN or 0 depending on the pandas version); zeros would drag
# down the averages computed from this frame.
group_keys = ['year', 'month', 'week', 'weekday', 'bike_type']
weekday_df = (bikeshare.groupby(group_keys, observed=True)['trip_id']
              .count()
              .reset_index(name='rentals'))
weekday_df.head(10)
Point plot:
# Seaborn look: white background and a three-colour palette (one colour per year)
sb.set_style('white')
year_palette = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_palette(year_palette, n_colors=3, desat=0.8)

# One facet per bike type, one coloured line per year; ci=None turns off error bars
x_order = weekday_df['weekday'].sort_values().unique()
grid = sb.FacetGrid(data=weekday_df, col='bike_type', col_wrap=2,
                    height=4, aspect=1, hue='year')
grid.map(sb.pointplot, 'weekday', 'rentals', order=x_order, linestyles='-', ci=None)

# Leave head-room for the figure title, then title each facet with its category
grid.fig.subplots_adjust(top=0.85)
grid.fig.suptitle('Average bike rentals based on day of the week over years by bike type',
                  fontsize=16, weight='bold')
grid.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Tick labels (rotated so weekday names do not overlap), axis labels and facet spacing
grid.set_xticklabels(x_order, rotation=30, size=12)
grid.set_yticklabels(size=12)
grid.set_xlabels('\nDay of the week', size=14)
grid.set_ylabels('Avg. Bike rentals\n', size=14)
plt.subplots_adjust(wspace=0.05, hspace=0.2)

# Hand-built legend: one line-with-marker handle per palette colour, labelled by year
active_palette = sb.color_palette()
legend_handles = [Line2D([], [], marker='.', color=active_palette[idx],
                         linestyle='-', linewidth=2)
                  for idx in range(3)]
legend_options = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False,
                      framealpha=1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                      ncol=1, title='Year', title_fontsize=12, fontsize=10,
                      facecolor='white', markerfirst=True, handlelength=2,
                      handletextpad=0.5, bbox_to_anchor=(1, 1.1))
plt.legend(legend_handles, ['2017', '2018', '2019'], **legend_options)

# bbox_inches='tight' adjusts the saved figure so all x and y labels are included
plt.savefig('plots/3.3.32 Average bike rentals based on day of the week over years by bike type.png',
            dpi=300, bbox_inches='tight')
month, year and bike_type columns:¶Columns: month, year, bike_typeData type: (Categorical, ordered), (Categorical, ordered) and (Categorical, nominal)Plot: Point plotFind average rentals based on the month over years:
Create a dataset which contains bike rentals relative to each month over respective years. Care should be taken to include only rentals that appear in every month over individual years. Use available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every month in any year.
# Build one row per (year, month, bike_type) with the number of rentals
# (non-null trip ids) in that month.
# observed=True keeps only key combinations that actually occur in the data.
# Without it, categorical keys are expanded to their full cartesian product and
# the never-observed combinations show up as empty groups (NaN or 0 depending on
# the pandas version); zeros would drag down the averages computed from this frame.
group_keys = ['year', 'month', 'bike_type']
month_df = (bikeshare.groupby(group_keys, observed=True)['trip_id']
            .count()
            .reset_index(name='rentals'))
month_df.head(10)
Point plot:
# Seaborn look: white background and a three-colour palette (one colour per year)
sb.set_style('white')
year_palette = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_palette(year_palette, n_colors=3, desat=0.8)

# One facet per bike type, one coloured line per year; ci=None turns off error bars
x_order = month_df['month'].sort_values().unique()
grid = sb.FacetGrid(data=month_df, col='bike_type', col_wrap=2,
                    height=4, aspect=1, hue='year')
grid.map(sb.pointplot, 'month', 'rentals', order=x_order, linestyles='-', ci=None)

# Leave head-room for the figure title, then title each facet with its category
grid.fig.subplots_adjust(top=0.85)
grid.fig.suptitle('Average bike rentals based on month over the years by bike type',
                  fontsize=16, weight='bold')
grid.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Tick labels, axis labels and spacing between facets
grid.set_xticklabels(x_order, size=12)
grid.set_yticklabels(size=12)
grid.set_xlabels('\nMonth of the year', size=14)
grid.set_ylabels('Avg. Bike rentals\n', size=14)
plt.subplots_adjust(wspace=0.05, hspace=0.2)

# Hand-built legend: one line-with-marker handle per palette colour, labelled by year
active_palette = sb.color_palette()
legend_handles = [Line2D([], [], marker='.', color=active_palette[idx],
                         linestyle='-', linewidth=2)
                  for idx in range(3)]
legend_options = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False,
                      framealpha=1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                      ncol=1, title='Year', title_fontsize=12, fontsize=10,
                      facecolor='white', markerfirst=True, handlelength=2,
                      handletextpad=0.5, bbox_to_anchor=(1, 1.1))
plt.legend(legend_handles, ['2017', '2018', '2019'], **legend_options)

# bbox_inches='tight' adjusts the saved figure so all x and y labels are included
plt.savefig('plots/3.3.33 Average bike rentals based on month over the years by bike type.png',
            dpi=300, bbox_inches='tight')
quarter, year and bike_type columns:¶Columns: quarter, year, bike_typeData type: (Categorical, ordered), (Categorical, ordered) and (Categorical, nominal)Plot: Point plotFind average rentals based on the quarter over years:
Create a dataset which contains bike rentals relative to each quarter over respective years. Care should be taken to include only rentals that appear in every quarter over individual years. Use available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every quarter in any year.
# Build one row per (year, quarter, bike_type) with the number of rentals
# (non-null trip ids) in that quarter.
# observed=True keeps only key combinations that actually occur in the data.
# Without it, categorical keys are expanded to their full cartesian product and
# the never-observed combinations show up as empty groups (NaN or 0 depending on
# the pandas version); zeros would drag down the averages computed from this frame.
group_keys = ['year', 'quarter', 'bike_type']
quarter_df = (bikeshare.groupby(group_keys, observed=True)['trip_id']
              .count()
              .reset_index(name='rentals'))
quarter_df.head(10)
Point plot:
# Seaborn look: white background and a three-colour palette (one colour per year)
sb.set_style('white')
year_palette = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_palette(year_palette, n_colors=3, desat=0.8)

# One facet per bike type, one coloured line per year; ci=None turns off error bars
x_order = quarter_df['quarter'].sort_values().unique()
grid = sb.FacetGrid(data=quarter_df, col='bike_type', col_wrap=2,
                    height=4, aspect=1, hue='year')
grid.map(sb.pointplot, 'quarter', 'rentals', order=x_order, linestyles='-', ci=None)

# Leave head-room for the figure title, then title each facet with its category
grid.fig.subplots_adjust(top=0.85)
grid.fig.suptitle('Average bike rentals based on quarter over the years by bike type',
                  fontsize=16, weight='bold')
grid.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Tick labels, axis labels and spacing between facets
grid.set_xticklabels(x_order, size=12)
grid.set_yticklabels(size=12)
grid.set_xlabels('\nQuarter of the year', size=14)
grid.set_ylabels('Avg. Bike rentals\n', size=14)
plt.subplots_adjust(wspace=0.05, hspace=0.2)

# Hand-built legend: one line-with-marker handle per palette colour, labelled by year
active_palette = sb.color_palette()
legend_handles = [Line2D([], [], marker='.', color=active_palette[idx],
                         linestyle='-', linewidth=2)
                  for idx in range(3)]
legend_options = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False,
                      framealpha=1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                      ncol=1, title='Year', title_fontsize=12, fontsize=10,
                      facecolor='white', markerfirst=True, handlelength=2,
                      handletextpad=0.5, bbox_to_anchor=(1, 1.1))
plt.legend(legend_handles, ['2017', '2018', '2019'], **legend_options)

# bbox_inches='tight' adjusts the saved figure so all x and y labels are included
plt.savefig('plots/3.3.34 Average bike rentals based on quarter over the years by bike type.png',
            dpi=300, bbox_inches='tight')
hour, year and pass_type columns:¶Columns: hour, year, pass_typeData type: (numerical, continuous), (Categorical, ordered) and (Categorical, nominal)Plot: Point plotFind average rentals based on the hour of the day over years:
Create a dataset which contains bike rentals relative to each hour in the day over respective months in the year. Care should be taken to include only hours that appear in every day of the month over individual years. Use available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every hour in any day.
# Build one row per (year, month, day, hour, pass_type) with the number of
# rentals (non-null trip ids) in that hour.
# observed=True keeps only key combinations that actually occur in the data.
# Without it, categorical keys are expanded to their full cartesian product and
# the never-observed combinations show up as empty groups (NaN or 0 depending on
# the pandas version); zeros would drag down the averages computed from this frame.
group_keys = ['year', 'month', 'day', 'hour', 'pass_type']
hours_df = (bikeshare.groupby(group_keys, observed=True)['trip_id']
            .count()
            .reset_index(name='rentals'))
hours_df.head(10)
Point plot:
# Seaborn look: white background and a three-colour palette (one colour per year)
sb.set_style('white')
year_palette = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_palette(year_palette, n_colors=3, desat=0.8)

# One facet per pass type, one coloured line per year; ci=None turns off error bars
x_order = hours_df['hour'].sort_values().unique()
grid = sb.FacetGrid(data=hours_df, col='pass_type', col_wrap=3,
                    height=4, aspect=1.5, hue='year')
grid.map(sb.pointplot, 'hour', 'rentals', order=x_order, linestyles='-', ci=None)

# Leave head-room for the figure title, then title each facet with its category
grid.fig.subplots_adjust(top=0.85)
grid.fig.suptitle('Average bike rentals based on hour of the day over years by pass type',
                  fontsize=16, weight='bold')
grid.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Tick labels, axis labels and spacing between facets
grid.set_xticklabels(x_order, size=12)
grid.set_yticklabels(size=12)
grid.set_xlabels('\nHour of the day', size=14)
grid.set_ylabels('Avg. Bike rentals\n', size=14)
plt.subplots_adjust(wspace=0.05, hspace=0.2)

# Hand-built legend: one line-with-marker handle per palette colour, labelled by year
active_palette = sb.color_palette()
legend_handles = [Line2D([], [], marker='.', color=active_palette[idx],
                         linestyle='-', linewidth=2)
                  for idx in range(3)]
legend_options = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False,
                      framealpha=1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                      ncol=1, title='Year', title_fontsize=12, fontsize=10,
                      facecolor='white', markerfirst=True, handlelength=2,
                      handletextpad=0.5, bbox_to_anchor=(1, 1.1))
plt.legend(legend_handles, ['2017', '2018', '2019'], **legend_options)

# bbox_inches='tight' adjusts the saved figure so all x and y labels are included
plt.savefig('plots/3.3.35 Average bike rentals based on hour of the day over years by pass type.png',
            dpi=300, bbox_inches='tight')
daytime, year and pass_type columns:¶Columns: daytime, year, pass_typeData type: (Categorical, ordered), (Categorical, ordered) and (Categorical, nominal)Plot: Point plotFind average rentals based on the daytime of the day over years:
Create a dataset which contains bike rentals relative to each daytime in the day over respective months in the year. Care should be taken to include only daytimes that appear in every day of the month over individual years. Use available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every daytime in any day.
# Build one row per (year, month, day, daytime, pass_type) with the number of
# rentals (non-null trip ids) in that slot.
# observed=True keeps only key combinations that actually occur in the data.
# Without it, categorical keys are expanded to their full cartesian product and
# the never-observed combinations show up as empty groups (NaN or 0 depending on
# the pandas version); zeros would drag down the averages computed from this frame.
group_keys = ['year', 'month', 'day', 'daytime', 'pass_type']
daytime_df = (bikeshare.groupby(group_keys, observed=True)['trip_id']
              .count()
              .reset_index(name='rentals'))
daytime_df.head(10)
Point plot:
# Seaborn look: white background and a three-colour palette (one colour per year)
sb.set_style('white')
year_palette = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_palette(year_palette, n_colors=3, desat=0.8)

# One facet per pass type, one coloured line per year; ci=None turns off error bars
x_order = daytime_df['daytime'].sort_values().unique()
grid = sb.FacetGrid(data=daytime_df, col='pass_type', col_wrap=3,
                    height=4, aspect=1.2, hue='year')
grid.map(sb.pointplot, 'daytime', 'rentals', order=x_order, linestyles='-', ci=None)

# Leave head-room for the figure title, then title each facet with its category
grid.fig.subplots_adjust(top=0.85)
grid.fig.suptitle('Average bike rentals based on time of the day over years by pass type',
                  fontsize=16, weight='bold')
grid.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Tick labels, axis labels and spacing between facets
grid.set_xticklabels(x_order, size=12)
grid.set_yticklabels(size=12)
grid.set_xlabels('\nTime of the day', size=14)
grid.set_ylabels('Avg. Bike rentals\n', size=14)
plt.subplots_adjust(wspace=0.05, hspace=0.2)

# Hand-built legend: one line-with-marker handle per palette colour, labelled by year
active_palette = sb.color_palette()
legend_handles = [Line2D([], [], marker='.', color=active_palette[idx],
                         linestyle='-', linewidth=2)
                  for idx in range(3)]
legend_options = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False,
                      framealpha=1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                      ncol=1, title='Year', title_fontsize=12, fontsize=10,
                      facecolor='white', markerfirst=True, handlelength=2,
                      handletextpad=0.5, bbox_to_anchor=(1, 1.1))
plt.legend(legend_handles, ['2017', '2018', '2019'], **legend_options)

# bbox_inches='tight' adjusts the saved figure so all x and y labels are included
plt.savefig('plots/3.3.36 Average bike rentals based on time of the day over years by pass type.png',
            dpi=300, bbox_inches='tight')
day, year and pass_type columns:¶Columns: day, year, pass_typeData type: (Categorical, ordered), (Categorical, ordered) and (Categorical, nominal)Plot: Point plotFind average rentals based on the day of the month over years:
Create a dataset which contains bike rentals relative to each day over respective months in the year. Care should be taken to include only days that appear in every month over individual years. Use available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every day in any month.
# Build one row per (year, month, day, pass_type) with the number of rentals
# (non-null trip ids) on that calendar day.
# observed=True keeps only key combinations that actually occur in the data.
# Without it, categorical keys are expanded to their full cartesian product
# (e.g. a 31st day for every month) and the never-observed combinations show up
# as empty groups (NaN or 0 depending on the pandas version); zeros would drag
# down the averages computed from this frame.
group_keys = ['year', 'month', 'day', 'pass_type']
days_df = (bikeshare.groupby(group_keys, observed=True)['trip_id']
           .count()
           .reset_index(name='rentals'))
days_df.head(10)
Point plot:
# Seaborn look: white background and a three-colour palette (one colour per year)
sb.set_style('white')
year_palette = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_palette(year_palette, n_colors=3, desat=0.8)

# One facet per pass type, one coloured line per year; ci=None turns off error bars
x_order = days_df['day'].sort_values().unique()
grid = sb.FacetGrid(data=days_df, col='pass_type', col_wrap=2,
                    height=4.5, aspect=1.8, hue='year')
grid.map(sb.pointplot, 'day', 'rentals', order=x_order, linestyles='-', ci=None)

# Leave head-room for the figure title, then title each facet with its category
grid.fig.subplots_adjust(top=0.9)
grid.fig.suptitle('Average bike rentals based on day of the month over years by pass type',
                  fontsize=16, weight='bold')
grid.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Tick labels, axis labels and spacing between facets
grid.set_xticklabels(x_order, size=12)
grid.set_yticklabels(size=12)
grid.set_xlabels('\nDay of the month', size=14)
grid.set_ylabels('Avg. Bike rentals\n', size=14)
plt.subplots_adjust(wspace=0.05, hspace=0.2)

# Hand-built legend: one line-with-marker handle per palette colour, labelled by year
active_palette = sb.color_palette()
legend_handles = [Line2D([], [], marker='.', color=active_palette[idx],
                         linestyle='-', linewidth=2)
                  for idx in range(3)]
legend_options = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False,
                      framealpha=1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                      ncol=1, title='Year', title_fontsize=12, fontsize=10,
                      facecolor='white', markerfirst=True, handlelength=2,
                      handletextpad=0.5, bbox_to_anchor=(1, 1.1))
plt.legend(legend_handles, ['2017', '2018', '2019'], **legend_options)

# bbox_inches='tight' adjusts the saved figure so all x and y labels are included
plt.savefig('plots/3.3.37 Average bike rentals based on day of the month over years by pass type.png',
            dpi=300, bbox_inches='tight')
weekday, year and pass_type columns:¶Columns: weekday, year, pass_typeData type: (Categorical, ordered), (Categorical, ordered) and (Categorical, nominal)Plot: Point plotFind average rentals based on the day of the week over years:
Create a dataset which contains bike rentals relative to each day over respective week in the month. Care should be taken to include only days that appear in every week over individual months. Use available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every day in any week.
# Build one row per (year, month, week, weekday, pass_type) with the number of
# rentals (non-null trip ids) in that slot.
# observed=True keeps only key combinations that actually occur in the data.
# Without it, categorical keys are expanded to their full cartesian product
# (most week/month pairs never coexist) and the never-observed combinations show
# up as empty groups (NaN or 0 depending on the pandas version); zeros would drag
# down the averages computed from this frame.
group_keys = ['year', 'month', 'week', 'weekday', 'pass_type']
weekday_df = (bikeshare.groupby(group_keys, observed=True)['trip_id']
              .count()
              .reset_index(name='rentals'))
weekday_df.head(10)
Point plot:
# Seaborn look: white background and a three-colour palette (one colour per year)
sb.set_style('white')
year_palette = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_palette(year_palette, n_colors=3, desat=0.8)

# One facet per pass type, one coloured line per year; ci=None turns off error bars
x_order = weekday_df['weekday'].sort_values().unique()
grid = sb.FacetGrid(data=weekday_df, col='pass_type', col_wrap=3,
                    height=4, aspect=1, hue='year')
grid.map(sb.pointplot, 'weekday', 'rentals', order=x_order, linestyles='-', ci=None)

# Leave head-room for the figure title, then title each facet with its category
grid.fig.subplots_adjust(top=0.85)
grid.fig.suptitle('Average bike rentals based on day of the week over years by pass type',
                  fontsize=16, weight='bold')
grid.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Tick labels (rotated so weekday names do not overlap), axis labels and facet spacing
grid.set_xticklabels(x_order, rotation=30, size=12)
grid.set_yticklabels(size=12)
grid.set_xlabels('\nDay of the week', size=14)
grid.set_ylabels('Avg. Bike rentals\n', size=14)
plt.subplots_adjust(wspace=0.05, hspace=0.2)

# Hand-built legend: one line-with-marker handle per palette colour, labelled by year
active_palette = sb.color_palette()
legend_handles = [Line2D([], [], marker='.', color=active_palette[idx],
                         linestyle='-', linewidth=2)
                  for idx in range(3)]
legend_options = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False,
                      framealpha=1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                      ncol=1, title='Year', title_fontsize=12, fontsize=10,
                      facecolor='white', markerfirst=True, handlelength=2,
                      handletextpad=0.5, bbox_to_anchor=(1, 1.1))
plt.legend(legend_handles, ['2017', '2018', '2019'], **legend_options)

# bbox_inches='tight' adjusts the saved figure so all x and y labels are included
plt.savefig('plots/3.3.38 Average bike rentals based on day of the week over years by pass type.png',
            dpi=300, bbox_inches='tight')
month, year and pass_type columns:¶Columns: month, year, pass_typeData type: (Categorical, ordered), (Categorical, ordered) and (Categorical, nominal)Plot: Point plotFind average rentals based on the month over years:
Create a dataset which contains bike rentals relative to each month over respective years. Care should be taken to include only rentals that appear in every month over individual years. Use available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every month in any year.
# Build one row per (year, month, pass_type) with the number of rentals
# (non-null trip ids) in that month.
# observed=True keeps only key combinations that actually occur in the data.
# Without it, categorical keys are expanded to their full cartesian product and
# the never-observed combinations show up as empty groups (NaN or 0 depending on
# the pandas version); zeros would drag down the averages computed from this frame.
group_keys = ['year', 'month', 'pass_type']
month_df = (bikeshare.groupby(group_keys, observed=True)['trip_id']
            .count()
            .reset_index(name='rentals'))
month_df.head(10)
Point plot:
# Seaborn look: white background and a three-colour palette (one colour per year)
sb.set_style('white')
year_palette = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_palette(year_palette, n_colors=3, desat=0.8)

# One facet per pass type, one coloured line per year; ci=None turns off error bars
x_order = month_df['month'].sort_values().unique()
grid = sb.FacetGrid(data=month_df, col='pass_type', col_wrap=3,
                    height=4, aspect=1, hue='year')
grid.map(sb.pointplot, 'month', 'rentals', order=x_order, linestyles='-', ci=None)

# Leave head-room for the figure title, then title each facet with its category
grid.fig.subplots_adjust(top=0.85)
grid.fig.suptitle('Average bike rentals based on month over the years by pass type',
                  fontsize=16, weight='bold')
grid.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Tick labels, axis labels and spacing between facets
grid.set_xticklabels(x_order, size=12)
grid.set_yticklabels(size=12)
grid.set_xlabels('\nMonth of the year', size=14)
grid.set_ylabels('Avg. Bike rentals\n', size=14)
plt.subplots_adjust(wspace=0.05, hspace=0.2)

# Hand-built legend: one line-with-marker handle per palette colour, labelled by year
active_palette = sb.color_palette()
legend_handles = [Line2D([], [], marker='.', color=active_palette[idx],
                         linestyle='-', linewidth=2)
                  for idx in range(3)]
legend_options = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False,
                      framealpha=1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                      ncol=1, title='Year', title_fontsize=12, fontsize=10,
                      facecolor='white', markerfirst=True, handlelength=2,
                      handletextpad=0.5, bbox_to_anchor=(1, 1.1))
plt.legend(legend_handles, ['2017', '2018', '2019'], **legend_options)

# bbox_inches='tight' adjusts the saved figure so all x and y labels are included
plt.savefig('plots/3.3.39 Average bike rentals based on month over the years by pass type.png',
            dpi=300, bbox_inches='tight')
quarter, year and pass_type columns:¶Columns: quarter, year, pass_typeData type: (Categorical, ordered), (Categorical, ordered) and (Categorical, nominal)Plot: Point plotFind average rentals based on the quarter over years:
Create a dataset which contains bike rentals relative to each quarter over respective years. Care should be taken to include only rentals that appear in every quarter over individual years. Use available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every quarter in any year.
# Build one row per (year, quarter, pass_type) with the number of rentals
# (non-null trip ids) in that quarter.
# observed=True keeps only key combinations that actually occur in the data.
# Without it, categorical keys are expanded to their full cartesian product and
# the never-observed combinations show up as empty groups (NaN or 0 depending on
# the pandas version); zeros would drag down the averages computed from this frame.
group_keys = ['year', 'quarter', 'pass_type']
quarter_df = (bikeshare.groupby(group_keys, observed=True)['trip_id']
              .count()
              .reset_index(name='rentals'))
quarter_df.head(10)
Point plot:
# Seaborn look: white background and a three-colour palette (one colour per year)
sb.set_style('white')
year_palette = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_palette(year_palette, n_colors=3, desat=0.8)

# One facet per pass type, one coloured line per year; ci=None turns off error bars
x_order = quarter_df['quarter'].sort_values().unique()
grid = sb.FacetGrid(data=quarter_df, col='pass_type', col_wrap=3,
                    height=4, aspect=1, hue='year')
grid.map(sb.pointplot, 'quarter', 'rentals', order=x_order, linestyles='-', ci=None)

# Leave head-room for the figure title, then title each facet with its category
grid.fig.subplots_adjust(top=0.85)
grid.fig.suptitle('Average bike rentals based on quarter over the years by pass type',
                  fontsize=16, weight='bold')
grid.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Tick labels, axis labels and spacing between facets
grid.set_xticklabels(x_order, size=12)
grid.set_yticklabels(size=12)
grid.set_xlabels('\nQuarter of the year', size=14)
grid.set_ylabels('Avg. Bike rentals\n', size=14)
plt.subplots_adjust(wspace=0.05, hspace=0.2)

# Hand-built legend: one line-with-marker handle per palette colour, labelled by year
active_palette = sb.color_palette()
legend_handles = [Line2D([], [], marker='.', color=active_palette[idx],
                         linestyle='-', linewidth=2)
                  for idx in range(3)]
legend_options = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False,
                      framealpha=1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                      ncol=1, title='Year', title_fontsize=12, fontsize=10,
                      facecolor='white', markerfirst=True, handlelength=2,
                      handletextpad=0.5, bbox_to_anchor=(1, 1.1))
plt.legend(legend_handles, ['2017', '2018', '2019'], **legend_options)

# bbox_inches='tight' adjusts the saved figure so all x and y labels are included
plt.savefig('plots/3.3.40 Average bike rentals based on quarter over the years by pass type.png',
            dpi=300, bbox_inches='tight')
hour, year and fare_type columns:¶Columns: hour, year, fare_typeData type: (numerical, continuous), (Categorical, ordered) and (Categorical, nominal)Plot: Point plotFind average rentals based on the hour of the day over years:
Create a dataset which contains bike rentals relative to each hour in the day over respective months in the year. Care should be taken to include only hours that appear in every day of the month over individual years. Use available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every hour in any day.
# Build one row per (year, month, day, hour, fare_type) with the number of
# rentals (non-null trip ids) in that hour.
# observed=True keeps only key combinations that actually occur in the data.
# Without it, categorical keys are expanded to their full cartesian product and
# the never-observed combinations show up as empty groups (NaN or 0 depending on
# the pandas version); zeros would drag down the averages computed from this frame.
group_keys = ['year', 'month', 'day', 'hour', 'fare_type']
hours_df = (bikeshare.groupby(group_keys, observed=True)['trip_id']
            .count()
            .reset_index(name='rentals'))
hours_df.head(10)
Point plot:
# Seaborn look: white background and a three-colour palette (one colour per year)
sb.set_style('white')
year_palette = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_palette(year_palette, n_colors=3, desat=0.8)

# One facet per fare type, one coloured line per year; ci=None turns off error bars
x_order = hours_df['hour'].sort_values().unique()
grid = sb.FacetGrid(data=hours_df, col='fare_type', col_wrap=2,
                    height=4, aspect=1.5, hue='year')
grid.map(sb.pointplot, 'hour', 'rentals', order=x_order, linestyles='-', ci=None)

# Leave head-room for the figure title, then title each facet with its category
grid.fig.subplots_adjust(top=0.8)
grid.fig.suptitle('Average bike rentals based on hour of the day over years by fare type',
                  fontsize=16, weight='bold')
grid.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Tick labels, axis labels and spacing between facets
grid.set_xticklabels(x_order, size=12)
grid.set_yticklabels(size=12)
grid.set_xlabels('\nHour of the day', size=14)
grid.set_ylabels('Avg. Bike rentals\n', size=14)
plt.subplots_adjust(wspace=0.05, hspace=0.2)

# Hand-built legend: one line-with-marker handle per palette colour, labelled by year
active_palette = sb.color_palette()
legend_handles = [Line2D([], [], marker='.', color=active_palette[idx],
                         linestyle='-', linewidth=2)
                  for idx in range(3)]
legend_options = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False,
                      framealpha=1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                      ncol=1, title='Year', title_fontsize=12, fontsize=10,
                      facecolor='white', markerfirst=True, handlelength=2,
                      handletextpad=0.5, bbox_to_anchor=(1, 1.1))
plt.legend(legend_handles, ['2017', '2018', '2019'], **legend_options)

# bbox_inches='tight' adjusts the saved figure so all x and y labels are included
plt.savefig('plots/3.3.41 Average bike rentals based on hour of the day over years by fare type.png',
            dpi=300, bbox_inches='tight')
daytime, year and fare_type columns:¶Columns: daytime, year, fare_typeData type: (Categorical, ordered), (Categorical, ordered) and (Categorical, nominal)Plot: Point plotFind average rentals based on the daytime of the day over years:
Create a dataset which contains bike rentals relative to each daytime in the day over respective months in the year. Care should be taken to include only daytimes that appear in every day of the month over individual years. Use available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every daytime in any day.
# Build one row per (year, month, day, daytime, fare_type) with the number of
# rentals (non-null trip ids) in that slot.
# observed=True keeps only key combinations that actually occur in the data.
# Without it, categorical keys are expanded to their full cartesian product and
# the never-observed combinations show up as empty groups (NaN or 0 depending on
# the pandas version); zeros would drag down the averages computed from this frame.
group_keys = ['year', 'month', 'day', 'daytime', 'fare_type']
daytime_df = (bikeshare.groupby(group_keys, observed=True)['trip_id']
              .count()
              .reset_index(name='rentals'))
daytime_df.head(10)
Point plot:
# Seaborn look: white background and a three-colour palette (one colour per year)
sb.set_style('white')
year_palette = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_palette(year_palette, n_colors=3, desat=0.8)

# One facet per fare type, one coloured line per year; ci=None turns off error bars
x_order = daytime_df['daytime'].sort_values().unique()
grid = sb.FacetGrid(data=daytime_df, col='fare_type', col_wrap=2,
                    height=4, aspect=1.2, hue='year')
grid.map(sb.pointplot, 'daytime', 'rentals', order=x_order, linestyles='-', ci=None)

# Leave head-room for the figure title, then title each facet with its category
grid.fig.subplots_adjust(top=0.8)
grid.fig.suptitle('Average bike rentals based on time of the day over years by fare type',
                  fontsize=16, weight='bold')
grid.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Tick labels, axis labels and spacing between facets
grid.set_xticklabels(x_order, size=12)
grid.set_yticklabels(size=12)
grid.set_xlabels('\nTime of the day', size=14)
grid.set_ylabels('Avg. Bike rentals\n', size=14)
plt.subplots_adjust(wspace=0.05, hspace=0.2)

# Hand-built legend: one line-with-marker handle per palette colour, labelled by year
active_palette = sb.color_palette()
legend_handles = [Line2D([], [], marker='.', color=active_palette[idx],
                         linestyle='-', linewidth=2)
                  for idx in range(3)]
legend_options = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False,
                      framealpha=1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                      ncol=1, title='Year', title_fontsize=12, fontsize=10,
                      facecolor='white', markerfirst=True, handlelength=2,
                      handletextpad=0.5, bbox_to_anchor=(1, 1.1))
plt.legend(legend_handles, ['2017', '2018', '2019'], **legend_options)

# bbox_inches='tight' adjusts the saved figure so all x and y labels are included
plt.savefig('plots/3.3.42 Average bike rentals based on time of the day over years by fare type.png',
            dpi=300, bbox_inches='tight')
day, year and fare_type columns:¶
Columns: day, year, fare_type
Data type: (Categorical, ordered), (Categorical, ordered) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the day of the month over years:
Create a dataset which contains bike rentals relative to each day over the respective months in the year. Care should be taken to include only days that appear in every month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every day in any month.
# Count rentals (non-null trip_id) per year / month / day / fare_type.
# observed=True keeps only the key combinations that actually occur in the data.
# For categorical grouping keys, pandas >= 1.1 would otherwise emit a zero-count
# row for every unobserved combination (e.g. a 31st day in a 30-day month), and
# those zeros would be averaged into the point plot.
days_df = (
    bikeshare
    .groupby(["year", "month", "day", "fare_type"], observed=True)["trip_id"]
    .count()
    .reset_index(name="rentals")
)
days_df.head(10)
Point plot:
# Theme: white background, three-colour palette (one colour per year)
flatui = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# x-axis categories in ascending order
plot_order = days_df['day'].sort_values().unique()

# One facet per fare type, one line per year, no error bars
g = sb.FacetGrid(data=days_df, col='fare_type', col_wrap=2, height=4.5, aspect=1.8, hue='year')
g.map(sb.pointplot, "day", "rentals", order=plot_order, linestyles="-", ci=None)

# Figure title and per-facet titles
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Average bike rentals based on day of the month over years by fare type', fontsize=16, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Tick and axis labels
g.set_xticklabels(plot_order, size=12)
g.set_yticklabels(size=12)
g.set_xlabels('\nDay of the month', size=14)
g.set_ylabels('Avg. Bike rentals\n', size=14)
plt.subplots_adjust(wspace=0.05, hspace=0.2)

# Hand-built legend: one line handle per palette colour
custom = [Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
          for shade in sb.color_palette()[:3]]
legend_kwargs = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False,
                     framealpha=1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                     ncol=1, title='Year', title_fontsize=12, fontsize=10,
                     facecolor='white', markerfirst=True, handlelength=2,
                     handletextpad=0.5, bbox_to_anchor=(1, 1.1))
plt.legend(custom, ['2017', '2018', '2019'], **legend_kwargs)

# bbox_inches='tight' grows the saved area so all axis labels are included
plt.savefig('plots/3.3.43 Average bike rentals based on day of the month over years by fare type.png', dpi=300, bbox_inches='tight')
weekday, year and fare_type columns:¶
Columns: weekday, year, fare_type
Data type: (Categorical, ordered), (Categorical, ordered) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the day of the week over years:
Create a dataset which contains bike rentals relative to each day over the respective week in the month. Care should be taken to include only days that appear in every week over individual months. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every day in any week.
# Count rentals (non-null trip_id) per year / month / week / weekday / fare_type.
# observed=True keeps only the key combinations that actually occur in the data.
# For categorical grouping keys, pandas >= 1.1 would otherwise emit a zero-count
# row for every unobserved combination (e.g. a week number that does not fall in
# a given month), and those zeros would be averaged into the point plot.
weekday_df = (
    bikeshare
    .groupby(["year", "month", "week", "weekday", "fare_type"], observed=True)["trip_id"]
    .count()
    .reset_index(name="rentals")
)
weekday_df.head(10)
Point plot:
# Theme: white background, three-colour palette (one colour per year)
flatui = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# x-axis categories in ascending order
plot_order = weekday_df['weekday'].sort_values().unique()

# One facet per fare type, one line per year, no error bars
g = sb.FacetGrid(data=weekday_df, col='fare_type', col_wrap=2, height=4, aspect=1, hue='year')
g.map(sb.pointplot, "weekday", "rentals", order=plot_order, linestyles="-", ci=None)

# Figure title and per-facet titles
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Average bike rentals based on day of the week over years by fare type', fontsize=16, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Tick and axis labels (weekday names are rotated to avoid overlap)
g.set_xticklabels(plot_order, rotation=30, size=12)
g.set_yticklabels(size=12)
g.set_xlabels('\nDay of the week', size=14)
g.set_ylabels('Avg. Bike rentals\n', size=14)
plt.subplots_adjust(wspace=0.05, hspace=0.2)

# Hand-built legend: one line handle per palette colour
custom = [Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
          for shade in sb.color_palette()[:3]]
legend_kwargs = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False,
                     framealpha=1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                     ncol=1, title='Year', title_fontsize=12, fontsize=10,
                     facecolor='white', markerfirst=True, handlelength=2,
                     handletextpad=0.5, bbox_to_anchor=(1, 1.1))
plt.legend(custom, ['2017', '2018', '2019'], **legend_kwargs)

# bbox_inches='tight' grows the saved area so all axis labels are included
plt.savefig('plots/3.3.44 Average bike rentals based on day of the week over years by fare type.png', dpi=300, bbox_inches='tight')
month, year and fare_type columns:¶
Columns: month, year, fare_type
Data type: (Categorical, ordered), (Categorical, ordered) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the month over years:
Create a dataset which contains bike rentals relative to each month over the respective years. Care should be taken to include only rentals that appear in every month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every month in any year.
# Count rentals (non-null trip_id) per year / month / fare_type.
# observed=True keeps only the key combinations that actually occur in the data.
# For categorical grouping keys, pandas >= 1.1 would otherwise emit a zero-count
# row for every unobserved combination, and those zeros would be drawn on the
# point plot as if they were real measurements.
month_df = (
    bikeshare
    .groupby(["year", "month", "fare_type"], observed=True)["trip_id"]
    .count()
    .reset_index(name="rentals")
)
month_df.head(10)
Point plot:
# Theme: white background, three-colour palette (one colour per year)
flatui = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# x-axis categories in ascending order
plot_order = month_df['month'].sort_values().unique()

# One facet per fare type, one line per year, no error bars
g = sb.FacetGrid(data=month_df, col='fare_type', col_wrap=2, height=4, aspect=1.2, hue='year')
g.map(sb.pointplot, "month", "rentals", order=plot_order, linestyles="-", ci=None)

# Figure title and per-facet titles
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Average bike rentals based on month over the years by fare type', fontsize=16, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Tick and axis labels
g.set_xticklabels(plot_order, size=12)
g.set_yticklabels(size=12)
g.set_xlabels('\nMonth of the year', size=14)
g.set_ylabels('Avg. Bike rentals\n', size=14)
plt.subplots_adjust(wspace=0.05, hspace=0.2)

# Hand-built legend: one line handle per palette colour
custom = [Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
          for shade in sb.color_palette()[:3]]
legend_kwargs = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False,
                     framealpha=1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                     ncol=1, title='Year', title_fontsize=12, fontsize=10,
                     facecolor='white', markerfirst=True, handlelength=2,
                     handletextpad=0.5, bbox_to_anchor=(1, 1.1))
plt.legend(custom, ['2017', '2018', '2019'], **legend_kwargs)

# bbox_inches='tight' grows the saved area so all axis labels are included
plt.savefig('plots/3.3.45 Average bike rentals based on month over the years by fare type.png', dpi=300, bbox_inches='tight')
quarter, year and fare_type columns:¶
Columns: quarter, year, fare_type
Data type: (Categorical, ordered), (Categorical, ordered) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the quarter over years:
Create a dataset which contains bike rentals relative to each quarter over the respective years. Care should be taken to include only rentals that appear in every quarter over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every quarter in any year.
# Count rentals (non-null trip_id) per year / quarter / fare_type.
# observed=True keeps only the key combinations that actually occur in the data.
# For categorical grouping keys, pandas >= 1.1 would otherwise emit a zero-count
# row for every unobserved combination, and those zeros would be drawn on the
# point plot as if they were real measurements.
quarter_df = (
    bikeshare
    .groupby(["year", "quarter", "fare_type"], observed=True)["trip_id"]
    .count()
    .reset_index(name="rentals")
)
quarter_df.head(10)
Point plot:
# Theme: white background, three-colour palette (one colour per year)
flatui = ['#6b8a99', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# x-axis categories in ascending order
plot_order = quarter_df['quarter'].sort_values().unique()

# One facet per fare type, one line per year, no error bars
g = sb.FacetGrid(data=quarter_df, col='fare_type', col_wrap=2, height=4, aspect=1, hue='year')
g.map(sb.pointplot, "quarter", "rentals", order=plot_order, linestyles="-", ci=None)

# Figure title and per-facet titles
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle('Average bike rentals based on quarter over the years by fare type', fontsize=16, weight='bold')
g.set_titles('{col_name}', weight='bold', size=14, color='dimgrey')

# Tick and axis labels
g.set_xticklabels(plot_order, size=12)
g.set_yticklabels(size=12)
g.set_xlabels('\nQuarter of the year', size=14)
g.set_ylabels('Avg. Bike rentals\n', size=14)
plt.subplots_adjust(wspace=0.05, hspace=0.2)

# Hand-built legend: one line handle per palette colour
custom = [Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
          for shade in sb.color_palette()[:3]]
legend_kwargs = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False,
                     framealpha=1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                     ncol=1, title='Year', title_fontsize=12, fontsize=10,
                     facecolor='white', markerfirst=True, handlelength=2,
                     handletextpad=0.5, bbox_to_anchor=(1, 1.1))
plt.legend(custom, ['2017', '2018', '2019'], **legend_kwargs)

# bbox_inches='tight' grows the saved area so all axis labels are included
plt.savefig('plots/3.3.46 Average bike rentals based on quarter over the years by fare type.png', dpi=300, bbox_inches='tight')
hour, year, trip_type and bike_type columns:¶
Columns: hour, year, trip_type, bike_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the hour over years:
Create a dataset which contains bike rentals relative to each hour in the day over the respective months in the year. Care should be taken to include only hours that appear in every day of the month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every hour in any day.
# Count rentals (non-null trip_id) per year / month / day / hour / trip_type /
# bike_type.
# observed=True keeps only the key combinations that actually occur in the data.
# For categorical grouping keys, pandas >= 1.1 would otherwise emit a zero-count
# row for every unobserved combination, and those zeros would be averaged into
# the point plot instead of being ignored.
hours_df = (
    bikeshare
    .groupby(["year", "month", "day", "hour", "trip_type", "bike_type"], observed=True)["trip_id"]
    .count()
    .reset_index(name="rentals")
)
hours_df.head(10)
Point plot:
# Theme: white background, three-colour palette (one colour per year)
flatui = ['#cc3f9d', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# x-axis categories in ascending order
plot_order = hours_df['hour'].sort_values().unique()

# Grid: rows = trip type, columns = bike type, one line per year, no error bars
g = sb.FacetGrid(data=hours_df, col='bike_type', row='trip_type', height=4, aspect=1, hue='year')
g.map(sb.pointplot, "hour", "rentals", order=plot_order, linestyles="-", ci=None, markers=['.'])

# Figure title and per-facet titles
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Average bike rentals based on hour of the day over years by trip type and bike type',
               fontsize=16, weight='bold')
g.set_titles('Trip = {row_name} | Bike = {col_name}', weight='bold', size=14, color='dimgrey')

# Axis labels and y tick size
g.set_xlabels('\nHour of the day', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)

# Keep every 5th x tick label and blank the rest to avoid crowding
for ax in g.axes.flat:
    thinned = [tick if pos % 5 == 0 else ''
               for pos, tick in enumerate(ax.get_xticklabels())]
    ax.set_xticklabels(thinned, size=12)

# Hand-built legend: one line handle per palette colour
custom = [Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
          for shade in sb.color_palette()[:3]]
legend_kwargs = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False,
                     framealpha=1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                     ncol=3, title='Year', title_fontsize=12, fontsize=10,
                     facecolor='white', markerfirst=True, handlelength=2,
                     handletextpad=0.5, bbox_to_anchor=(-0.6, 2.9))
plt.legend(custom, ['2017', '2018', '2019'], **legend_kwargs)

plt.subplots_adjust(wspace=0.05, hspace=0.3)

# bbox_inches='tight' grows the saved area so all axis labels are included
plt.savefig('plots/3.3.47 Average bike rentals based on hour over the years by trip type and bike type.png', dpi=300, bbox_inches='tight')
daytime, year, trip_type and bike_type columns:¶
Columns: daytime, year, trip_type, bike_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the daytime of the day over years:
Create a dataset which contains bike rentals relative to each daytime in the day over the respective months in the year. Care should be taken to include only daytimes that appear in every day of the month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every daytime in any day.
# Count rentals (non-null trip_id) per year / month / day / daytime / trip_type /
# bike_type, i.e. one row per daytime of each calendar day.
# observed=True keeps only the key combinations that actually occur in the data.
# For categorical grouping keys, pandas >= 1.1 would otherwise emit a zero-count
# row for every unobserved combination, and those zeros would be averaged into
# the point plot instead of being ignored.
daytime_df = (
    bikeshare
    .groupby(["year", "month", "day", "daytime", "trip_type", "bike_type"], observed=True)["trip_id"]
    .count()
    .reset_index(name="rentals")
)
daytime_df.head(10)
Point plot:
# Theme: white background, three-colour palette (one colour per year)
flatui = ['#cc3f9d', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# x-axis categories in ascending order
plot_order = daytime_df['daytime'].sort_values().unique()

# Grid: rows = trip type, columns = bike type, one line per year, no error bars
g = sb.FacetGrid(data=daytime_df, col='bike_type', row='trip_type', height=4, aspect=1, hue='year')
g.map(sb.pointplot, "daytime", "rentals", order=plot_order, linestyles="-", ci=None, markers=['.'])

# Figure title and per-facet titles
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Average bike rentals based on time of the day over years by trip type and bike type',
               fontsize=16, weight='bold')
g.set_titles('Trip = {row_name} | Bike = {col_name}', weight='bold', size=14, color='dimgrey')

# Axis labels and y tick size
g.set_xlabels('\nTime of the day', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)

# Re-apply each facet's x tick labels rotated by 30 degrees
for ax in g.axes.flat:
    ax.set_xticklabels(ax.get_xticklabels(), rotation=30, size=12)

# Hand-built legend: one line handle per palette colour
custom = [Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
          for shade in sb.color_palette()[:3]]
legend_kwargs = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False,
                     framealpha=1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                     ncol=3, title='Year', title_fontsize=12, fontsize=10,
                     facecolor='white', markerfirst=True, handlelength=2,
                     handletextpad=0.5, bbox_to_anchor=(-0.6, 2.9))
plt.legend(custom, ['2017', '2018', '2019'], **legend_kwargs)

plt.subplots_adjust(wspace=0.05, hspace=0.3)

# bbox_inches='tight' grows the saved area so all axis labels are included
plt.savefig('plots/3.3.48 Average bike rentals based on daytime over the years by trip type and bike type.png', dpi=300, bbox_inches='tight')
day, year, trip_type and bike_type columns:¶
Columns: day, year, trip_type, bike_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the day of the month over years:
Create a dataset which contains bike rentals relative to each day over the respective months in the year. Care should be taken to include only days that appear in every month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every day in any month.
# Count rentals (non-null trip_id) per year / month / day / trip_type / bike_type.
# observed=True keeps only the key combinations that actually occur in the data.
# For categorical grouping keys, pandas >= 1.1 would otherwise emit a zero-count
# row for every unobserved combination (e.g. a 31st day in a 30-day month), and
# those zeros would be averaged into the point plot.
days_df = (
    bikeshare
    .groupby(["year", "month", "day", "trip_type", "bike_type"], observed=True)["trip_id"]
    .count()
    .reset_index(name="rentals")
)
days_df.head(10)
Point plot:
# Theme: white background, three-colour palette (one colour per year)
flatui = ['#cc3f9d', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# x-axis categories in ascending order
plot_order = days_df['day'].sort_values().unique()

# Grid: rows = trip type, columns = bike type, one line per year, no error bars
g = sb.FacetGrid(data=days_df, col='bike_type', row='trip_type', height=4, aspect=1, hue='year')
g.map(sb.pointplot, "day", "rentals", order=plot_order, linestyles="-", ci=None, markers=['.'])

# Figure title and per-facet titles
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Average bike rentals based on day of the month over years by trip type and bike type',
               fontsize=16, weight='bold')
g.set_titles('Trip = {row_name} | Bike = {col_name}', weight='bold', size=14, color='dimgrey')

# Axis labels and y tick size
g.set_xlabels('\nDay of the month', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)

# Keep every 5th x tick label and blank the rest to avoid crowding
for ax in g.axes.flat:
    thinned = [tick if pos % 5 == 0 else ''
               for pos, tick in enumerate(ax.get_xticklabels())]
    ax.set_xticklabels(thinned, size=12)

# Hand-built legend: one line handle per palette colour
custom = [Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
          for shade in sb.color_palette()[:3]]
legend_kwargs = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False,
                     framealpha=1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                     ncol=3, title='Year', title_fontsize=12, fontsize=10,
                     facecolor='white', markerfirst=True, handlelength=2,
                     handletextpad=0.5, bbox_to_anchor=(-0.6, 2.9))
plt.legend(custom, ['2017', '2018', '2019'], **legend_kwargs)

plt.subplots_adjust(wspace=0.05, hspace=0.3)

# bbox_inches='tight' grows the saved area so all axis labels are included
plt.savefig('plots/3.3.49 Average bike rentals based on day over the years by trip type and bike type.png', dpi=300, bbox_inches='tight')
weekday, year, trip_type and bike_type columns:¶
Columns: weekday, year, trip_type, bike_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the day of the week over years:
Create a dataset which contains bike rentals relative to each day over the respective week in the month. Care should be taken to include only days that appear in every week over individual months. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every day in any week.
# Count rentals (non-null trip_id) per year / month / week / weekday / trip_type /
# bike_type.
# observed=True keeps only the key combinations that actually occur in the data.
# For categorical grouping keys, pandas >= 1.1 would otherwise emit a zero-count
# row for every unobserved combination (e.g. a week number that does not fall in
# a given month), and those zeros would be averaged into the point plot.
weekday_df = (
    bikeshare
    .groupby(["year", "month", "week", "weekday", "trip_type", "bike_type"], observed=True)["trip_id"]
    .count()
    .reset_index(name="rentals")
)
weekday_df.head(10)
Point plot:
# Theme: white background, three-colour palette (one colour per year)
flatui = ['#cc3f9d', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# x-axis categories in ascending order
plot_order = weekday_df['weekday'].sort_values().unique()

# Grid: rows = trip type, columns = bike type, one line per year, no error bars
g = sb.FacetGrid(data=weekday_df, col='bike_type', row='trip_type', height=4.5, aspect=1, hue='year')
g.map(sb.pointplot, "weekday", "rentals", order=plot_order, linestyles="-", ci=None, markers=['.'])

# Figure title and per-facet titles
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Average bike rentals based on day of the week over years by trip type and bike type',
               fontsize=16, weight='bold')
g.set_titles('Trip = {row_name} | Bike = {col_name}', weight='bold', size=14, color='dimgrey')

# Axis labels and y tick size
g.set_xlabels('\nDay of the week', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)

# Re-apply each facet's x tick labels rotated by 30 degrees
for ax in g.axes.flat:
    ax.set_xticklabels(ax.get_xticklabels(), rotation=30, size=12)

# Hand-built legend: one line handle per palette colour
custom = [Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
          for shade in sb.color_palette()[:3]]
legend_kwargs = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False,
                     framealpha=1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                     ncol=3, title='Year', title_fontsize=12, fontsize=10,
                     facecolor='white', markerfirst=True, handlelength=2,
                     handletextpad=0.5, bbox_to_anchor=(-0.6, 2.9))
plt.legend(custom, ['2017', '2018', '2019'], **legend_kwargs)

plt.subplots_adjust(wspace=0.05, hspace=0.3)

# bbox_inches='tight' grows the saved area so all axis labels are included
plt.savefig('plots/3.3.50 Average bike rentals based on weekday over the years by trip type and bike type.png', dpi=300, bbox_inches='tight')
month, year, trip_type and bike_type columns:¶
Columns: month, year, trip_type, bike_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the month over years:
Create a dataset which contains bike rentals relative to each month over the respective years. Care should be taken to include only rentals that appear in every month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every month in any year.
# Count rentals (non-null trip_id) per year / month / trip_type / bike_type.
# observed=True keeps only the key combinations that actually occur in the data.
# For categorical grouping keys, pandas >= 1.1 would otherwise emit a zero-count
# row for every unobserved combination, and those zeros would be drawn on the
# point plot as if they were real measurements.
month_df = (
    bikeshare
    .groupby(["year", "month", "trip_type", "bike_type"], observed=True)["trip_id"]
    .count()
    .reset_index(name="rentals")
)
month_df.head(10)
Point plot:
# Theme: white background, three-colour palette (one colour per year)
flatui = ['#cc3f9d', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# x-axis categories in ascending order
plot_order = month_df['month'].sort_values().unique()

# Grid: rows = trip type, columns = bike type, one line per year, no error bars
g = sb.FacetGrid(data=month_df, col='bike_type', row='trip_type', height=4, aspect=1, hue='year')
g.map(sb.pointplot, "month", "rentals", order=plot_order, linestyles="-", ci=None, markers=['.'])

# Figure title and per-facet titles
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Average bike rentals based on month over years by trip type and bike type',
               fontsize=16, weight='bold')
g.set_titles('Trip = {row_name} | Bike = {col_name}', weight='bold', size=14, color='dimgrey')

# Axis labels and y tick size
g.set_xlabels('\nMonth of the year', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)

# Blank the x tick labels at even positions (0, 2, 4, ...); keep the odd ones
for ax in g.axes.flat:
    thinned = ['' if pos % 2 == 0 else tick
               for pos, tick in enumerate(ax.get_xticklabels())]
    ax.set_xticklabels(thinned, size=12)

# Hand-built legend: one line handle per palette colour
custom = [Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
          for shade in sb.color_palette()[:3]]
legend_kwargs = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False,
                     framealpha=1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                     ncol=3, title='Year', title_fontsize=12, fontsize=10,
                     facecolor='white', markerfirst=True, handlelength=2,
                     handletextpad=0.5, bbox_to_anchor=(-0.6, 2.9))
plt.legend(custom, ['2017', '2018', '2019'], **legend_kwargs)

plt.subplots_adjust(wspace=0.05, hspace=0.3)

# bbox_inches='tight' grows the saved area so all axis labels are included
plt.savefig('plots/3.3.51 Average bike rentals based on month over the years by trip type and bike type.png', dpi=300, bbox_inches='tight')
quarter, year, trip_type and bike_type columns:¶
Columns: quarter, year, trip_type, bike_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the quarter over years:
Create a dataset which contains bike rentals relative to each quarter over the respective years. Care should be taken to include only rentals that appear in every quarter over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every quarter in any year.
# Count rentals (non-null trip_id) per year / quarter / trip_type / bike_type,
# i.e. one row per quarter of each year.
# observed=True keeps only the key combinations that actually occur in the data.
# For categorical grouping keys, pandas >= 1.1 would otherwise emit a zero-count
# row for every unobserved combination, and those zeros would be drawn on the
# point plot as if they were real measurements.
quarter_df = (
    bikeshare
    .groupby(["year", "quarter", "trip_type", "bike_type"], observed=True)["trip_id"]
    .count()
    .reset_index(name="rentals")
)
quarter_df.head(10)
Point plot:
# Theme: white background, three-colour palette (one colour per year)
flatui = ['#cc3f9d', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# x-axis categories in ascending order
plot_order = quarter_df['quarter'].sort_values().unique()

# Grid: rows = trip type, columns = bike type, one line per year, no error bars
g = sb.FacetGrid(data=quarter_df, col='bike_type', row='trip_type', height=4.5, aspect=1, hue='year')
g.map(sb.pointplot, "quarter", "rentals", order=plot_order, linestyles="-", ci=None, markers=['.'])

# Figure title and per-facet titles
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Average bike rentals based on quarter of individual years by trip type and bike type',
               fontsize=16, weight='bold')
g.set_titles('Trip = {row_name} | Bike = {col_name}', weight='bold', size=14, color='dimgrey')

# Axis labels and tick sizes
g.set_xlabels('\nQuarter of the year', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_xticklabels(size=12)
g.set_yticklabels(size=12)

# Hand-built legend: one line handle per palette colour
custom = [Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
          for shade in sb.color_palette()[:3]]
legend_kwargs = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False,
                     framealpha=1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                     ncol=3, title='Year', title_fontsize=12, fontsize=10,
                     facecolor='white', markerfirst=True, handlelength=2,
                     handletextpad=0.5, bbox_to_anchor=(-0.6, 2.9))
plt.legend(custom, ['2017', '2018', '2019'], **legend_kwargs)

plt.subplots_adjust(wspace=0.05, hspace=0.3)

# bbox_inches='tight' grows the saved area so all axis labels are included
plt.savefig('plots/3.3.52 Average bike rentals based on quarter of individual years by trip type and bike type.png', dpi=300, bbox_inches='tight')
hour, year, trip_type and pass_type columns:¶
Columns: hour, year, trip_type, pass_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the hour over years:
Create a dataset which contains bike rentals relative to each hour in the day over the respective months in the year. Care should be taken to include only hours that appear in every day of the month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every hour in any day.
# Count rentals (non-null trip_id) per year / month / day / hour / trip_type /
# pass_type.
# observed=True keeps only the key combinations that actually occur in the data.
# For categorical grouping keys, pandas >= 1.1 would otherwise emit a zero-count
# row for every unobserved combination, and those zeros would be averaged into
# the point plot instead of being ignored.
hours_df = (
    bikeshare
    .groupby(["year", "month", "day", "hour", "trip_type", "pass_type"], observed=True)["trip_id"]
    .count()
    .reset_index(name="rentals")
)
hours_df.head(10)
Point plot:
# Theme: white background, three-colour palette (one colour per year)
flatui = ['#cc3f9d', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# x-axis categories in ascending order
plot_order = hours_df['hour'].sort_values().unique()

# Grid: rows = trip type, columns = pass type, one line per year, no error bars
g = sb.FacetGrid(data=hours_df, col='pass_type', row='trip_type', height=4, aspect=1, hue='year')
g.map(sb.pointplot, "hour", "rentals", order=plot_order, linestyles="-", ci=None, markers=['.'])

# Figure title and per-facet titles
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Average bike rentals based on hour of the day over years by trip type and pass type',
               fontsize=16, weight='bold')
g.set_titles('Trip = {row_name} | Pass = {col_name}', weight='bold', size=14, color='dimgrey')

# Axis labels and y tick size
g.set_xlabels('\nHour of the day', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)

# Keep every 5th x tick label and blank the rest to avoid crowding
for ax in g.axes.flat:
    thinned = [tick if pos % 5 == 0 else ''
               for pos, tick in enumerate(ax.get_xticklabels())]
    ax.set_xticklabels(thinned, size=12)

# Hand-built legend: one line handle per palette colour
custom = [Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
          for shade in sb.color_palette()[:3]]
legend_kwargs = dict(scatterpoints=1, frameon=True, fancybox=True, shadow=False,
                     framealpha=1, borderpad=1, borderaxespad=1, labelspacing=0.5,
                     ncol=3, title='Year', title_fontsize=12, fontsize=10,
                     facecolor='white', markerfirst=True, handlelength=2,
                     handletextpad=0.5, bbox_to_anchor=(-1.15, 2.9))
plt.legend(custom, ['2017', '2018', '2019'], **legend_kwargs)

plt.subplots_adjust(wspace=0.05, hspace=0.3)

# bbox_inches='tight' grows the saved area so all axis labels are included
plt.savefig('plots/3.3.53 Average bike rentals based on hour over the years by trip type and pass type.png', dpi=300, bbox_inches='tight')
daytime, year, trip_type and pass_type columns:
Columns: daytime, year, trip_type, pass_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the time of the day over years:
Create a dataset which contains bike rentals relative to each daytime in the day over respective months in the year. Care should be taken to include only daytimes that appear in every day of the month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every daytime in any day.
# Rentals per time-of-day bucket: number of non-null trip ids per
# year/month/day/daytime slot, split by trip type and pass type.
# NOTE(review): if any key is categorical, groupby's `observed` setting decides
# whether unobserved combinations appear (as 0 counts) - confirm intent.
daytime_df = (
    bikeshare
    .groupby(['year', 'month', 'day', 'daytime', 'trip_type', 'pass_type'])['trip_id']
    .count()
    .reset_index(name='rentals')
)
daytime_df.head(10)
Point plot:
# Theme: plain white background and a three-colour palette (one colour per year)
flatui = ['#cc3f9d', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# x-axis categories in ascending order
plot_order = daytime_df['daytime'].sort_values(ascending=True).unique()

# Grid of point plots: rows = trip type, columns = pass type, one line per year
g = sb.FacetGrid(data=daytime_df, row='trip_type', col='pass_type', hue='year',
                 height=4, aspect=1)
g.map(sb.pointplot, 'daytime', 'rentals', order=plot_order, linestyles='-',
      ci=None, markers=['.'])

# Shrink the grid vertically, then add the figure-level title
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Average bike rentals based on time of the day over years by trip type and pass type',
               fontsize=16, weight='bold')
g.set_titles('Trip = {row_name} | Pass = {col_name}', weight='bold', size=14, color='dimgrey')

# Axis labels and tick-label sizes
g.set_xlabels('\nTime of the day', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)
for ax in g.axes.flat:
    # re-apply the existing x tick labels, rotated by 30 degrees
    labels = ax.get_xticklabels()
    ax.set_xticklabels(labels, rotation=30, size=12)

# Custom legend: one proxy line per palette colour, labelled by year
custom = [Line2D([], [], marker='.', color=colour, linestyle='-', linewidth=2)
          for colour in sb.color_palette()[:3]]
plt.legend(custom, ['2017', '2018', '2019'],
           title='Year', title_fontsize=12, fontsize=10, ncol=3,
           frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
           scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
           borderpad=1, borderaxespad=1, labelspacing=0.5,
           bbox_to_anchor=(-1.15, 2.9))

plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' adjusts the saved figure to include all x and y labels
plt.savefig('plots/3.3.54 Average bike rentals based on daytime over the years by trip type and pass type.png',
            dpi=300, bbox_inches='tight')
day, year, trip_type and pass_type columns:
Columns: day, year, trip_type, pass_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the day of the month over years:
Create a dataset which contains bike rentals relative to each day over respective months in the year. Care should be taken to include only days that appear in every month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every day in any month.
# Daily rentals: number of non-null trip ids per year/month/day,
# split by trip type and pass type.
# NOTE(review): if any key is categorical, groupby's `observed` setting decides
# whether unobserved combinations appear (as 0 counts) - confirm intent.
days_df = (
    bikeshare
    .groupby(['year', 'month', 'day', 'trip_type', 'pass_type'])['trip_id']
    .count()
    .reset_index(name='rentals')
)
days_df.head(10)
Point plot:
# Theme: plain white background and a three-colour palette (one colour per year)
flatui = ['#cc3f9d', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# x-axis categories in ascending order
plot_order = days_df['day'].sort_values(ascending=True).unique()

# Grid of point plots: rows = trip type, columns = pass type, one line per year
g = sb.FacetGrid(data=days_df, row='trip_type', col='pass_type', hue='year',
                 height=4, aspect=1)
g.map(sb.pointplot, 'day', 'rentals', order=plot_order, linestyles='-',
      ci=None, markers=['.'])

# Shrink the grid vertically, then add the figure-level title
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Average bike rentals based on day of the month over years by trip type and pass type',
               fontsize=16, weight='bold')
g.set_titles('Trip = {row_name} | Pass = {col_name}', weight='bold', size=14, color='dimgrey')

# Axis labels and tick-label sizes
g.set_xlabels('\nDay of the month', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)
for ax in g.axes.flat:
    # keep every 5th x tick label and blank the others
    labels = [tick if pos % 5 == 0 else ''
              for pos, tick in enumerate(ax.get_xticklabels())]
    ax.set_xticklabels(labels, size=12)

# Custom legend: one proxy line per palette colour, labelled by year
custom = [Line2D([], [], marker='.', color=colour, linestyle='-', linewidth=2)
          for colour in sb.color_palette()[:3]]
plt.legend(custom, ['2017', '2018', '2019'],
           title='Year', title_fontsize=12, fontsize=10, ncol=3,
           frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
           scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
           borderpad=1, borderaxespad=1, labelspacing=0.5,
           bbox_to_anchor=(-1.15, 2.9))

plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' adjusts the saved figure to include all x and y labels
plt.savefig('plots/3.3.55 Average bike rentals based on day over the years by trip type and pass type.png',
            dpi=300, bbox_inches='tight')
weekday, year, trip_type and pass_type columns:
Columns: weekday, year, trip_type, pass_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the day of the week over years:
Create a dataset which contains bike rentals relative to each day over respective weeks in the month. Care should be taken to include only days that appear in every week over individual months. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every day in any week.
# Rentals per weekday: number of non-null trip ids per year/month/week/weekday,
# split by trip type and pass type.
# NOTE(review): if any key is categorical, groupby's `observed` setting decides
# whether unobserved combinations appear (as 0 counts) - confirm intent.
weekday_df = (
    bikeshare
    .groupby(['year', 'month', 'week', 'weekday', 'trip_type', 'pass_type'])['trip_id']
    .count()
    .reset_index(name='rentals')
)
weekday_df.head(10)
Point plot:
# Theme: plain white background and a three-colour palette (one colour per year)
flatui = ['#cc3f9d', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# x-axis categories in ascending order
plot_order = weekday_df['weekday'].sort_values(ascending=True).unique()

# Grid of point plots: rows = trip type, columns = pass type, one line per year
g = sb.FacetGrid(data=weekday_df, row='trip_type', col='pass_type', hue='year',
                 height=4.5, aspect=1)
g.map(sb.pointplot, 'weekday', 'rentals', order=plot_order, linestyles='-',
      ci=None, markers=['.'])

# Shrink the grid vertically, then add the figure-level title
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Average bike rentals based on day of the week over years by trip type and pass type',
               fontsize=16, weight='bold')
g.set_titles('Trip = {row_name} | Pass = {col_name}', weight='bold', size=14, color='dimgrey')

# Axis labels and tick-label sizes
g.set_xlabels('\nDay of the week', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)
for ax in g.axes.flat:
    # re-apply the existing x tick labels, rotated by 30 degrees
    labels = ax.get_xticklabels()
    ax.set_xticklabels(labels, rotation=30, size=12)

# Custom legend: one proxy line per palette colour, labelled by year
custom = [Line2D([], [], marker='.', color=colour, linestyle='-', linewidth=2)
          for colour in sb.color_palette()[:3]]
plt.legend(custom, ['2017', '2018', '2019'],
           title='Year', title_fontsize=12, fontsize=10, ncol=3,
           frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
           scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
           borderpad=1, borderaxespad=1, labelspacing=0.5,
           bbox_to_anchor=(-1.15, 2.9))

plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' adjusts the saved figure to include all x and y labels
plt.savefig('plots/3.3.56 Average bike rentals based on weekday over the years by trip type and pass type.png',
            dpi=300, bbox_inches='tight')
month, year, trip_type and pass_type columns:
Columns: month, year, trip_type, pass_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the month over years:
Create a dataset which contains bike rentals relative to each month over respective years. Care should be taken to include only rentals that appear in every month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every month in any year.
# Monthly rentals: number of non-null trip ids per year/month,
# split by trip type and pass type.
# NOTE(review): if any key is categorical, groupby's `observed` setting decides
# whether unobserved combinations appear (as 0 counts) - confirm intent.
month_df = (
    bikeshare
    .groupby(['year', 'month', 'trip_type', 'pass_type'])['trip_id']
    .count()
    .reset_index(name='rentals')
)
month_df.head(10)
Point plot:
# Theme: plain white background and a three-colour palette (one colour per year)
flatui = ['#cc3f9d', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# x-axis categories in ascending order
plot_order = month_df['month'].sort_values(ascending=True).unique()

# Grid of point plots: rows = trip type, columns = pass type, one line per year
g = sb.FacetGrid(data=month_df, row='trip_type', col='pass_type', hue='year',
                 height=4, aspect=1)
g.map(sb.pointplot, 'month', 'rentals', order=plot_order, linestyles='-',
      ci=None, markers=['.'])

# Shrink the grid vertically, then add the figure-level title
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Average bike rentals based on month over years by trip type and pass type',
               fontsize=16, weight='bold')
g.set_titles('Trip = {row_name} | Pass = {col_name}', weight='bold', size=14, color='dimgrey')

# Axis labels and tick-label sizes
g.set_xlabels('\nMonth of the year', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)
for ax in g.axes.flat:
    # blank the x tick labels at even positions, keeping every other one
    xlabels = ['' if pos % 2 == 0 else tick
               for pos, tick in enumerate(ax.get_xticklabels())]
    ax.set_xticklabels(xlabels, size=12)

# Custom legend: one proxy line per palette colour, labelled by year
custom = [Line2D([], [], marker='.', color=colour, linestyle='-', linewidth=2)
          for colour in sb.color_palette()[:3]]
plt.legend(custom, ['2017', '2018', '2019'],
           title='Year', title_fontsize=12, fontsize=10, ncol=3,
           frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
           scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
           borderpad=1, borderaxespad=1, labelspacing=0.5,
           bbox_to_anchor=(-1.15, 2.9))

plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' adjusts the saved figure to include all x and y labels
plt.savefig('plots/3.3.57 Average bike rentals based on month over the years by trip type and pass type.png',
            dpi=300, bbox_inches='tight')
quarter, year, trip_type and pass_type columns:
Columns: quarter, year, trip_type, pass_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the quarter over years:
Create a dataset which contains bike rentals relative to each quarter over respective years. Care should be taken to include only rentals that appear in every quarter over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every quarter in any year.
# Quarterly rentals: number of non-null trip ids per year/quarter,
# split by trip type and pass type.
# NOTE(review): if any key is categorical, groupby's `observed` setting decides
# whether unobserved combinations appear (as 0 counts) - confirm intent.
quarter_df = (
    bikeshare
    .groupby(['year', 'quarter', 'trip_type', 'pass_type'])['trip_id']
    .count()
    .reset_index(name='rentals')
)
quarter_df.head(10)
Point plot:
# Theme: plain white background and a three-colour palette (one colour per year)
flatui = ['#cc3f9d', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# x-axis categories in ascending order
plot_order = quarter_df['quarter'].sort_values(ascending=True).unique()

# Grid of point plots: rows = trip type, columns = pass type, one line per year
g = sb.FacetGrid(data=quarter_df, row='trip_type', col='pass_type', hue='year',
                 height=4.5, aspect=1)
g.map(sb.pointplot, 'quarter', 'rentals', order=plot_order, linestyles='-',
      ci=None, markers=['.'])

# Shrink the grid vertically, then add the figure-level title
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Average bike rentals based on quarter of individual years by trip type and pass type',
               fontsize=16, weight='bold')
g.set_titles('Trip = {row_name} | Pass = {col_name}', weight='bold', size=14, color='dimgrey')

# Axis labels and tick-label sizes
g.set_xlabels('\nQuarter of the year', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_xticklabels(size=12)
g.set_yticklabels(size=12)

# Custom legend: one proxy line per palette colour, labelled by year
custom = [Line2D([], [], marker='.', color=colour, linestyle='-', linewidth=2)
          for colour in sb.color_palette()[:3]]
plt.legend(custom, ['2017', '2018', '2019'],
           title='Year', title_fontsize=12, fontsize=10, ncol=3,
           frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
           scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
           borderpad=1, borderaxespad=1, labelspacing=0.5,
           bbox_to_anchor=(-1.15, 2.9))

plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' adjusts the saved figure to include all x and y labels
plt.savefig('plots/3.3.58 Average bike rentals based on quarter of individual years by trip type and pass type.png',
            dpi=300, bbox_inches='tight')
hour, year, trip_type and fare_type columns:
Columns: hour, year, trip_type, fare_type
Data type: (numerical, continuous), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the hour over years:
Create a dataset which contains bike rentals relative to each hour in the day over respective months in the year. Care should be taken to include only hours that appear in every day of the month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every hour in any day.
# Hourly rentals: number of non-null trip ids per year/month/day/hour slot,
# split by trip type and fare type.
# NOTE(review): if any key is categorical, groupby's `observed` setting decides
# whether unobserved combinations appear (as 0 counts) - confirm intent.
hours_df = (
    bikeshare
    .groupby(['year', 'month', 'day', 'hour', 'trip_type', 'fare_type'])['trip_id']
    .count()
    .reset_index(name='rentals')
)
hours_df.head(10)
Point plot:
# Theme: plain white background and a three-colour palette (one colour per year)
flatui = ['#cc3f9d', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# x-axis categories in ascending order
plot_order = hours_df['hour'].sort_values(ascending=True).unique()

# Grid of point plots: rows = trip type, columns = fare type, one line per year
g = sb.FacetGrid(data=hours_df, row='trip_type', col='fare_type', hue='year',
                 height=4, aspect=1)
g.map(sb.pointplot, 'hour', 'rentals', order=plot_order, linestyles='-',
      ci=None, markers=['.'])

# Shrink the grid vertically, then add the figure-level title
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Average bike rentals based on hour of the day over years by trip type and fare type',
               fontsize=16, weight='bold')
g.set_titles('Trip = {row_name} | Fare = {col_name}', weight='bold', size=14, color='dimgrey')

# Axis labels and tick-label sizes
g.set_xlabels('\nHour of the day', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)
for ax in g.axes.flat:
    # keep every 5th x tick label and blank the others
    labels = [tick if pos % 5 == 0 else ''
              for pos, tick in enumerate(ax.get_xticklabels())]
    ax.set_xticklabels(labels, size=12)

# Custom legend: one proxy line per palette colour, labelled by year
custom = [Line2D([], [], marker='.', color=colour, linestyle='-', linewidth=2)
          for colour in sb.color_palette()[:3]]
plt.legend(custom, ['2017', '2018', '2019'],
           title='Year', title_fontsize=12, fontsize=10, ncol=3,
           frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
           scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
           borderpad=1, borderaxespad=1, labelspacing=0.5,
           bbox_to_anchor=(0.4, 2.9))

plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' adjusts the saved figure to include all x and y labels
plt.savefig('plots/3.3.59 Average bike rentals based on hour over the years by trip type and fare type.png',
            dpi=300, bbox_inches='tight')
daytime, year, trip_type and fare_type columns:
Columns: daytime, year, trip_type, fare_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the time of the day over years:
Create a dataset which contains bike rentals relative to each daytime in the day over respective months in the year. Care should be taken to include only daytimes that appear in every day of the month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every daytime in any day.
# Rentals per time-of-day bucket: number of non-null trip ids per
# year/month/day/daytime slot, split by trip type and fare type.
# NOTE(review): if any key is categorical, groupby's `observed` setting decides
# whether unobserved combinations appear (as 0 counts) - confirm intent.
daytime_df = (
    bikeshare
    .groupby(['year', 'month', 'day', 'daytime', 'trip_type', 'fare_type'])['trip_id']
    .count()
    .reset_index(name='rentals')
)
daytime_df.head(10)
Point plot:
# Theme: plain white background and a three-colour palette (one colour per year)
flatui = ['#cc3f9d', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# x-axis categories in ascending order
plot_order = daytime_df['daytime'].sort_values(ascending=True).unique()

# Grid of point plots: rows = trip type, columns = fare type, one line per year
g = sb.FacetGrid(data=daytime_df, row='trip_type', col='fare_type', hue='year',
                 height=4, aspect=1)
g.map(sb.pointplot, 'daytime', 'rentals', order=plot_order, linestyles='-',
      ci=None, markers=['.'])

# Shrink the grid vertically, then add the figure-level title
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Average bike rentals based on time of the day over years by trip type and fare type',
               fontsize=16, weight='bold')
g.set_titles('Trip = {row_name} | Fare = {col_name}', weight='bold', size=14, color='dimgrey')

# Axis labels and tick-label sizes
g.set_xlabels('\nTime of the day', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)
for ax in g.axes.flat:
    # re-apply the existing x tick labels, rotated by 30 degrees
    labels = ax.get_xticklabels()
    ax.set_xticklabels(labels, rotation=30, size=12)

# Custom legend: one proxy line per palette colour, labelled by year
custom = [Line2D([], [], marker='.', color=colour, linestyle='-', linewidth=2)
          for colour in sb.color_palette()[:3]]
plt.legend(custom, ['2017', '2018', '2019'],
           title='Year', title_fontsize=12, fontsize=10, ncol=3,
           frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
           scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
           borderpad=1, borderaxespad=1, labelspacing=0.5,
           bbox_to_anchor=(0.4, 2.9))

plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' adjusts the saved figure to include all x and y labels
plt.savefig('plots/3.3.60 Average bike rentals based on daytime over the years by trip type and fare type.png',
            dpi=300, bbox_inches='tight')
day, year, trip_type and fare_type columns:
Columns: day, year, trip_type, fare_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the day of the month over years:
Create a dataset which contains bike rentals relative to each day over respective months in the year. Care should be taken to include only days that appear in every month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every day in any month.
# Daily rentals: number of non-null trip ids per year/month/day,
# split by trip type and fare type.
# NOTE(review): if any key is categorical, groupby's `observed` setting decides
# whether unobserved combinations appear (as 0 counts) - confirm intent.
days_df = (
    bikeshare
    .groupby(['year', 'month', 'day', 'trip_type', 'fare_type'])['trip_id']
    .count()
    .reset_index(name='rentals')
)
days_df.head(10)
Point plot:
# Theme: plain white background and a three-colour palette (one colour per year)
flatui = ['#cc3f9d', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# x-axis categories in ascending order
plot_order = days_df['day'].sort_values(ascending=True).unique()

# Grid of point plots: rows = trip type, columns = fare type, one line per year
g = sb.FacetGrid(data=days_df, row='trip_type', col='fare_type', hue='year',
                 height=4, aspect=1)
g.map(sb.pointplot, 'day', 'rentals', order=plot_order, linestyles='-',
      ci=None, markers=['.'])

# Shrink the grid vertically, then add the figure-level title
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Average bike rentals based on day of the month over years by trip type and fare type',
               fontsize=16, weight='bold')
g.set_titles('Trip = {row_name} | Fare = {col_name}', weight='bold', size=14, color='dimgrey')

# Axis labels and tick-label sizes
g.set_xlabels('\nDay of the month', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)
for ax in g.axes.flat:
    # keep every 5th x tick label and blank the others
    labels = [tick if pos % 5 == 0 else ''
              for pos, tick in enumerate(ax.get_xticklabels())]
    ax.set_xticklabels(labels, size=12)

# Custom legend: one proxy line per palette colour, labelled by year
custom = [Line2D([], [], marker='.', color=colour, linestyle='-', linewidth=2)
          for colour in sb.color_palette()[:3]]
plt.legend(custom, ['2017', '2018', '2019'],
           title='Year', title_fontsize=12, fontsize=10, ncol=3,
           frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
           scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
           borderpad=1, borderaxespad=1, labelspacing=0.5,
           bbox_to_anchor=(0.4, 2.9))

plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' adjusts the saved figure to include all x and y labels
plt.savefig('plots/3.3.61 Average bike rentals based on day over the years by trip type and fare type.png',
            dpi=300, bbox_inches='tight')
weekday, year, trip_type and fare_type columns:
Columns: weekday, year, trip_type, fare_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the day of the week over years:
Create a dataset which contains bike rentals relative to each day over respective weeks in the month. Care should be taken to include only days that appear in every week over individual months. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every day in any week.
# Rentals per weekday: number of non-null trip ids per year/month/week/weekday,
# split by trip type and fare type.
# NOTE(review): if any key is categorical, groupby's `observed` setting decides
# whether unobserved combinations appear (as 0 counts) - confirm intent.
weekday_df = (
    bikeshare
    .groupby(['year', 'month', 'week', 'weekday', 'trip_type', 'fare_type'])['trip_id']
    .count()
    .reset_index(name='rentals')
)
weekday_df.head(10)
Point plot:
# Theme: plain white background and a three-colour palette (one colour per year)
flatui = ['#cc3f9d', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# x-axis categories in ascending order
plot_order = weekday_df['weekday'].sort_values(ascending=True).unique()

# Grid of point plots: rows = trip type, columns = fare type, one line per year
g = sb.FacetGrid(data=weekday_df, row='trip_type', col='fare_type', hue='year',
                 height=4.5, aspect=1)
g.map(sb.pointplot, 'weekday', 'rentals', order=plot_order, linestyles='-',
      ci=None, markers=['.'])

# Shrink the grid vertically, then add the figure-level title
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Average bike rentals based on day of the week over years by trip type and fare type',
               fontsize=16, weight='bold')
g.set_titles('Trip = {row_name} | Fare = {col_name}', weight='bold', size=14, color='dimgrey')

# Axis labels and tick-label sizes
g.set_xlabels('\nDay of the week', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)
for ax in g.axes.flat:
    # re-apply the existing x tick labels, rotated by 30 degrees
    labels = ax.get_xticklabels()
    ax.set_xticklabels(labels, rotation=30, size=12)

# Custom legend: one proxy line per palette colour, labelled by year
custom = [Line2D([], [], marker='.', color=colour, linestyle='-', linewidth=2)
          for colour in sb.color_palette()[:3]]
plt.legend(custom, ['2017', '2018', '2019'],
           title='Year', title_fontsize=12, fontsize=10, ncol=3,
           frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
           scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
           borderpad=1, borderaxespad=1, labelspacing=0.5,
           bbox_to_anchor=(0.4, 2.9))

plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' adjusts the saved figure to include all x and y labels
plt.savefig('plots/3.3.62 Average bike rentals based on weekday over the years by trip type and fare type.png',
            dpi=300, bbox_inches='tight')
month, year, trip_type and fare_type columns:
Columns: month, year, trip_type, fare_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the month over years:
Create a dataset which contains bike rentals relative to each month over respective years. Care should be taken to include only rentals that appear in every month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every month in any year.
# Monthly rentals: number of non-null trip ids per year/month,
# split by trip type and fare type.
# NOTE(review): if any key is categorical, groupby's `observed` setting decides
# whether unobserved combinations appear (as 0 counts) - confirm intent.
month_df = (
    bikeshare
    .groupby(['year', 'month', 'trip_type', 'fare_type'])['trip_id']
    .count()
    .reset_index(name='rentals')
)
month_df.head(10)
Point plot:
# Theme: plain white background and a three-colour palette (one colour per year)
flatui = ['#cc3f9d', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# x-axis categories in ascending order
plot_order = month_df['month'].sort_values(ascending=True).unique()

# Grid of point plots: rows = trip type, columns = fare type, one line per year
g = sb.FacetGrid(data=month_df, row='trip_type', col='fare_type', hue='year',
                 height=4, aspect=1)
g.map(sb.pointplot, 'month', 'rentals', order=plot_order, linestyles='-',
      ci=None, markers=['.'])

# Shrink the grid vertically, then add the figure-level title
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle('Average bike rentals based on month over years by trip type and fare type',
               fontsize=16, weight='bold')
g.set_titles('Trip = {row_name} | Fare = {col_name}', weight='bold', size=14, color='dimgrey')

# Axis labels and tick-label sizes
g.set_xlabels('\nMonth of the year', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)
for ax in g.axes.flat:
    # blank the x tick labels at even positions, keeping every other one
    xlabels = ['' if pos % 2 == 0 else tick
               for pos, tick in enumerate(ax.get_xticklabels())]
    ax.set_xticklabels(xlabels, size=12)

# Custom legend: one proxy line per palette colour, labelled by year
custom = [Line2D([], [], marker='.', color=colour, linestyle='-', linewidth=2)
          for colour in sb.color_palette()[:3]]
plt.legend(custom, ['2017', '2018', '2019'],
           title='Year', title_fontsize=12, fontsize=10, ncol=3,
           frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
           scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
           borderpad=1, borderaxespad=1, labelspacing=0.5,
           bbox_to_anchor=(0.4, 2.9))

plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' adjusts the saved figure to include all x and y labels
plt.savefig('plots/3.3.63 Average bike rentals based on month over the years by trip type and fare type.png',
            dpi=300, bbox_inches='tight')
quarter, year, trip_type and fare_type columns:
Columns: quarter, year, trip_type, fare_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the quarter over years:
Create a dataset which contains bike rentals relative to each quarter over respective years. Care should be taken to include only rentals that appear in every quarter over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every quarter in any year.
# Create a dataset of bike rentals for each quarter of every year,
# split by trip type and fare type.
# observed=True keeps only the category combinations that actually occur:
# with categorical grouping keys pandas otherwise emits the full Cartesian
# product of categories, and (depending on the pandas version) reports the
# unobserved ones as NaN or 0 rentals -- zeros would bias the plotted averages.
# Selecting 'trip_id' before counting avoids counting every other column.
quarter_df = (
    bikeshare
    .groupby(['year', 'quarter', 'trip_type', 'fare_type'], observed=True)['trip_id']
    .count()
    .reset_index(name='rentals')
)
quarter_df.head(10)
Point plot:
# Point plot of rentals per quarter, one line per year,
# faceted by trip type (rows) and fare type (columns).
# NOTE(review): quarter_df holds one row per year/quarter/trip/fare combination,
# so each point presumably equals that single count rather than an average -- confirm.

# ---- theme ------------------------------------------------
flatui = ['#cc3f9d', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# ---- facet grid -------------------------------------------
# x positions follow the sorted quarter values present in the data
plot_order = quarter_df['quarter'].sort_values().unique()
g = sb.FacetGrid(data=quarter_df, row='trip_type', col='fare_type', hue='year',
                 height=4.5, aspect=1)
g.map(sb.pointplot, "quarter", "rentals", order=plot_order,
      markers=['.'], linestyles="-", ci=None)

# ---- titles -----------------------------------------------
# leave head-room above the facets for the figure title and the legend
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle(
    'Average bike rentals based on quarter of individual years by trip type and fare type',
    fontsize=16, weight='bold')
g.set_titles('Trip = {row_name} | Fare = {col_name}',
             weight='bold', size=14, color='dimgrey')

# ---- axis labels and ticks --------------------------------
g.set_xlabels('\nQuarter of the year', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_xticklabels(size=12)
g.set_yticklabels(size=12)

# ---- hand-built legend ------------------------------------
# one proxy line per palette colour; presumably these match the colours the
# grid assigns to the three year levels -- confirm against the data
custom = [Line2D([], [], marker='.', color=sb.color_palette()[k], linestyle='-', linewidth=2)
          for k in range(3)]
# the anchor offsets look hand-tuned for this grid; re-check if the layout changes
plt.legend(custom, ['2017', '2018', '2019'],
           title='Year', title_fontsize=12, fontsize=10, ncol=3,
           frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
           borderpad=1, borderaxespad=1, labelspacing=0.5,
           scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
           bbox_to_anchor=(0.4, 2.9))

# ---- spacing and export -----------------------------------
plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.64 Average bike rentals based on quarter of individual years by trip type and fare type.png',
            dpi=300, bbox_inches='tight')
hour, year, fare_type and bike_type columns:
Columns: hour, year, fare_type, bike_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the hour over years:
Create a dataset which contains bike rentals relative to each hour in the day over the respective months in the year. Care should be taken to include only hours that appear in every day of the month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider only the bike rentals recorded for each hour of a day.
# Create a dataset of bike rentals for each hour of every calendar day,
# split by fare type and bike type.
# observed=True keeps only the category combinations that actually occur:
# with categorical grouping keys pandas otherwise emits the full Cartesian
# product of categories, and (depending on the pandas version) reports the
# unobserved ones as NaN or 0 rentals -- zeros would bias the plotted averages.
# Selecting 'trip_id' before counting avoids counting every other column.
hours_df = (
    bikeshare
    .groupby(['year', 'month', 'day', 'hour', 'fare_type', 'bike_type'], observed=True)['trip_id']
    .count()
    .reset_index(name='rentals')
)
hours_df.head(10)
Point plot:
# Point plot of rentals per hour of the day, one line per year,
# faceted by fare type (rows) and bike type (columns).

# ---- theme ------------------------------------------------
flatui = ['#cc3f9d', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# ---- facet grid -------------------------------------------
# x positions follow the sorted hour values present in the data
plot_order = hours_df['hour'].sort_values().unique()
g = sb.FacetGrid(data=hours_df, row='fare_type', col='bike_type', hue='year',
                 height=4, aspect=1)
g.map(sb.pointplot, "hour", "rentals", order=plot_order,
      markers=['.'], linestyles="-", ci=None)

# ---- titles -----------------------------------------------
# leave head-room above the facets for the figure title and the legend
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle(
    'Average bike rentals based on hour of the day over years by fare type and bike type',
    fontsize=16, weight='bold')
g.set_titles('Fare = {row_name} | Bike = {col_name}',
             weight='bold', size=14, color='dimgrey')

# ---- axis labels and ticks --------------------------------
g.set_xlabels('\nHour of the day', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)
for ax in g.axes.flat:
    # keep every fifth tick label and blank the others to avoid crowding
    labels = [tick if i % 5 == 0 else '' for i, tick in enumerate(ax.get_xticklabels())]
    ax.set_xticklabels(labels, size=12)

# ---- hand-built legend ------------------------------------
# one proxy line per palette colour; presumably these match the colours the
# grid assigns to the three year levels -- confirm against the data
custom = [Line2D([], [], marker='.', color=sb.color_palette()[k], linestyle='-', linewidth=2)
          for k in range(3)]
# the anchor offsets look hand-tuned for this grid; re-check if the layout changes
plt.legend(custom, ['2017', '2018', '2019'],
           title='Year', title_fontsize=12, fontsize=10, ncol=3,
           frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
           borderpad=1, borderaxespad=1, labelspacing=0.5,
           scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
           bbox_to_anchor=(-0.6, 2.9))

# ---- spacing and export -----------------------------------
plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.65 Average bike rentals based on hour over the years by fare type and bike type.png',
            dpi=300, bbox_inches='tight')
daytime, year, fare_type and bike_type columns:
Columns: daytime, year, fare_type, bike_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the daytime of the day over years:
Create a dataset which contains bike rentals relative to each daytime in the day over the respective months in the year. Care should be taken to include only daytimes that appear in every day of the month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider only the bike rentals recorded for each daytime of a day.
# Create a dataset of bike rentals for each daytime of every calendar day,
# split by fare type and bike type.
# observed=True keeps only the category combinations that actually occur:
# with categorical grouping keys pandas otherwise emits the full Cartesian
# product of categories, and (depending on the pandas version) reports the
# unobserved ones as NaN or 0 rentals -- zeros would bias the plotted averages.
# Selecting 'trip_id' before counting avoids counting every other column.
daytime_df = (
    bikeshare
    .groupby(['year', 'month', 'day', 'daytime', 'fare_type', 'bike_type'], observed=True)['trip_id']
    .count()
    .reset_index(name='rentals')
)
daytime_df.head(10)
Point plot:
# Point plot of rentals per time of the day, one line per year,
# faceted by fare type (rows) and bike type (columns).

# ---- theme ------------------------------------------------
flatui = ['#cc3f9d', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# ---- facet grid -------------------------------------------
# x positions follow the sorted daytime values present in the data
plot_order = daytime_df['daytime'].sort_values().unique()
g = sb.FacetGrid(data=daytime_df, row='fare_type', col='bike_type', hue='year',
                 height=4, aspect=1)
g.map(sb.pointplot, "daytime", "rentals", order=plot_order,
      markers=['.'], linestyles="-", ci=None)

# ---- titles -----------------------------------------------
# leave head-room above the facets for the figure title and the legend
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle(
    'Average bike rentals based on time of the day over years by fare type and bike type',
    fontsize=16, weight='bold')
g.set_titles('Fare = {row_name} | Bike = {col_name}',
             weight='bold', size=14, color='dimgrey')

# ---- axis labels and ticks --------------------------------
g.set_xlabels('\nTime of the day', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)
for ax in g.axes.flat:
    # re-apply the existing tick labels, tilted by 30 degrees
    ax.set_xticklabels(ax.get_xticklabels(), rotation=30, size=12)

# ---- hand-built legend ------------------------------------
# one proxy line per palette colour; presumably these match the colours the
# grid assigns to the three year levels -- confirm against the data
custom = [Line2D([], [], marker='.', color=sb.color_palette()[k], linestyle='-', linewidth=2)
          for k in range(3)]
# the anchor offsets look hand-tuned for this grid; re-check if the layout changes
plt.legend(custom, ['2017', '2018', '2019'],
           title='Year', title_fontsize=12, fontsize=10, ncol=3,
           frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
           borderpad=1, borderaxespad=1, labelspacing=0.5,
           scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
           bbox_to_anchor=(-0.6, 2.9))

# ---- spacing and export -----------------------------------
plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.66 Average bike rentals based on daytime over the years by fare type and bike type.png',
            dpi=300, bbox_inches='tight')
day, year, fare_type and bike_type columns:
Columns: day, year, fare_type, bike_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the day of the month over years:
Create a dataset which contains bike rentals relative to each day over the respective months in the year. Care should be taken to include only days that appear in every month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider only the bike rentals recorded for each day of a month.
# Create a dataset of bike rentals for each day of every month,
# split by fare type and bike type.
# observed=True keeps only the category combinations that actually occur:
# with categorical grouping keys pandas otherwise emits the full Cartesian
# product of categories, and (depending on the pandas version) reports the
# unobserved ones as NaN or 0 rentals -- zeros would bias the plotted averages.
# Selecting 'trip_id' before counting avoids counting every other column.
days_df = (
    bikeshare
    .groupby(['year', 'month', 'day', 'fare_type', 'bike_type'], observed=True)['trip_id']
    .count()
    .reset_index(name='rentals')
)
days_df.head(10)
Point plot:
# Point plot of rentals per day of the month, one line per year,
# faceted by fare type (rows) and bike type (columns).

# ---- theme ------------------------------------------------
flatui = ['#cc3f9d', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# ---- facet grid -------------------------------------------
# x positions follow the sorted day values present in the data
plot_order = days_df['day'].sort_values().unique()
g = sb.FacetGrid(data=days_df, row='fare_type', col='bike_type', hue='year',
                 height=4, aspect=1)
g.map(sb.pointplot, "day", "rentals", order=plot_order,
      markers=['.'], linestyles="-", ci=None)

# ---- titles -----------------------------------------------
# leave head-room above the facets for the figure title and the legend
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle(
    'Average bike rentals based on day of the month over years by fare type and bike type',
    fontsize=16, weight='bold')
g.set_titles('Fare = {row_name} | Bike = {col_name}',
             weight='bold', size=14, color='dimgrey')

# ---- axis labels and ticks --------------------------------
g.set_xlabels('\nDay of the month', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)
for ax in g.axes.flat:
    # keep every fifth tick label and blank the others to avoid crowding
    labels = [tick if i % 5 == 0 else '' for i, tick in enumerate(ax.get_xticklabels())]
    ax.set_xticklabels(labels, size=12)

# ---- hand-built legend ------------------------------------
# one proxy line per palette colour; presumably these match the colours the
# grid assigns to the three year levels -- confirm against the data
custom = [Line2D([], [], marker='.', color=sb.color_palette()[k], linestyle='-', linewidth=2)
          for k in range(3)]
# the anchor offsets look hand-tuned for this grid; re-check if the layout changes
plt.legend(custom, ['2017', '2018', '2019'],
           title='Year', title_fontsize=12, fontsize=10, ncol=3,
           frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
           borderpad=1, borderaxespad=1, labelspacing=0.5,
           scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
           bbox_to_anchor=(-0.6, 2.9))

# ---- spacing and export -----------------------------------
plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.67 Average bike rentals based on day over the years by fare type and bike type.png',
            dpi=300, bbox_inches='tight')
weekday, year, fare_type and bike_type columns:
Columns: weekday, year, fare_type, bike_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the day of the week over years:
Create a dataset which contains bike rentals relative to each day over the respective weeks in the month. Care should be taken to include only days that appear in every week over individual months. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider only the bike rentals recorded for each day of a week.
# Create a dataset of bike rentals for each weekday of every week,
# split by fare type and bike type.
# observed=True keeps only the category combinations that actually occur:
# with categorical grouping keys pandas otherwise emits the full Cartesian
# product of categories, and (depending on the pandas version) reports the
# unobserved ones as NaN or 0 rentals -- zeros would bias the plotted averages.
# Selecting 'trip_id' before counting avoids counting every other column.
weekday_df = (
    bikeshare
    .groupby(['year', 'month', 'week', 'weekday', 'fare_type', 'bike_type'], observed=True)['trip_id']
    .count()
    .reset_index(name='rentals')
)
weekday_df.head(10)
Point plot:
# Point plot of rentals per day of the week, one line per year,
# faceted by fare type (rows) and bike type (columns).

# ---- theme ------------------------------------------------
flatui = ['#cc3f9d', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# ---- facet grid -------------------------------------------
# x positions follow the sorted weekday values present in the data
plot_order = weekday_df['weekday'].sort_values().unique()
g = sb.FacetGrid(data=weekday_df, row='fare_type', col='bike_type', hue='year',
                 height=4.5, aspect=1)
g.map(sb.pointplot, "weekday", "rentals", order=plot_order,
      markers=['.'], linestyles="-", ci=None)

# ---- titles -----------------------------------------------
# leave head-room above the facets for the figure title and the legend
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle(
    'Average bike rentals based on day of the week over years by fare type and bike type',
    fontsize=16, weight='bold')
g.set_titles('Fare = {row_name} | Bike = {col_name}',
             weight='bold', size=14, color='dimgrey')

# ---- axis labels and ticks --------------------------------
g.set_xlabels('\nDay of the week', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)
for ax in g.axes.flat:
    # re-apply the existing tick labels, tilted by 30 degrees
    ax.set_xticklabels(ax.get_xticklabels(), rotation=30, size=12)

# ---- hand-built legend ------------------------------------
# one proxy line per palette colour; presumably these match the colours the
# grid assigns to the three year levels -- confirm against the data
custom = [Line2D([], [], marker='.', color=sb.color_palette()[k], linestyle='-', linewidth=2)
          for k in range(3)]
# the anchor offsets look hand-tuned for this grid; re-check if the layout changes
plt.legend(custom, ['2017', '2018', '2019'],
           title='Year', title_fontsize=12, fontsize=10, ncol=3,
           frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
           borderpad=1, borderaxespad=1, labelspacing=0.5,
           scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
           bbox_to_anchor=(-0.6, 2.9))

# ---- spacing and export -----------------------------------
plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.68 Average bike rentals based on weekday over the years by fare type and bike type.png',
            dpi=300, bbox_inches='tight')
month, year, fare_type and bike_type columns:
Columns: month, year, fare_type, bike_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the month over years:
Create a dataset which contains bike rentals relative to each month over the respective years. Care should be taken to include only rentals that appear in every month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider only the bike rentals recorded for each month of a year.
# Create a dataset of bike rentals for each month of every year,
# split by fare type and bike type.
# observed=True keeps only the category combinations that actually occur:
# with categorical grouping keys pandas otherwise emits the full Cartesian
# product of categories, and (depending on the pandas version) reports the
# unobserved ones as NaN or 0 rentals -- zeros would bias the plotted averages.
# Selecting 'trip_id' before counting avoids counting every other column.
month_df = (
    bikeshare
    .groupby(['year', 'month', 'fare_type', 'bike_type'], observed=True)['trip_id']
    .count()
    .reset_index(name='rentals')
)
month_df.head(10)
Point plot:
# Point plot of rentals per month, one line per year,
# faceted by fare type (rows) and bike type (columns).
# NOTE(review): month_df holds one row per year/month/fare/bike combination,
# so each point presumably equals that single count rather than an average -- confirm.

# ---- theme ------------------------------------------------
flatui = ['#cc3f9d', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# ---- facet grid -------------------------------------------
# x positions follow the sorted month values present in the data
plot_order = month_df['month'].sort_values().unique()
g = sb.FacetGrid(data=month_df, row='fare_type', col='bike_type', hue='year',
                 height=4, aspect=1)
g.map(sb.pointplot, "month", "rentals", order=plot_order,
      markers=['.'], linestyles="-", ci=None)

# ---- titles -----------------------------------------------
# leave head-room above the facets for the figure title and the legend
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle(
    'Average bike rentals based on month over years by fare type and bike type',
    fontsize=16, weight='bold')
g.set_titles('Fare = {row_name} | Bike = {col_name}',
             weight='bold', size=14, color='dimgrey')

# ---- axis labels and ticks --------------------------------
g.set_xlabels('\nMonth of the year', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)
for ax in g.axes.flat:
    # blank the labels at even positions (0, 2, ...) so only every second month is named
    xlabels = ['' if i % 2 == 0 else tick for i, tick in enumerate(ax.get_xticklabels())]
    ax.set_xticklabels(xlabels, size=12)

# ---- hand-built legend ------------------------------------
# one proxy line per palette colour; presumably these match the colours the
# grid assigns to the three year levels -- confirm against the data
custom = [Line2D([], [], marker='.', color=sb.color_palette()[k], linestyle='-', linewidth=2)
          for k in range(3)]
# the anchor offsets look hand-tuned for this grid; re-check if the layout changes
plt.legend(custom, ['2017', '2018', '2019'],
           title='Year', title_fontsize=12, fontsize=10, ncol=3,
           frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
           borderpad=1, borderaxespad=1, labelspacing=0.5,
           scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
           bbox_to_anchor=(-0.6, 2.9))

# ---- spacing and export -----------------------------------
plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.69 Average bike rentals based on month over the years by fare type and bike type.png',
            dpi=300, bbox_inches='tight')
quarter, year, fare_type and bike_type columns:
Columns: quarter, year, fare_type, bike_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the quarter over years:
Create a dataset which contains bike rentals relative to each quarter over the respective years. Care should be taken to include only rentals that appear in every quarter over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider only the bike rentals recorded for each quarter of a year.
# Create a dataset of bike rentals for each quarter of every year,
# split by fare type and bike type.
# observed=True keeps only the category combinations that actually occur:
# with categorical grouping keys pandas otherwise emits the full Cartesian
# product of categories, and (depending on the pandas version) reports the
# unobserved ones as NaN or 0 rentals -- zeros would bias the plotted averages.
# Selecting 'trip_id' before counting avoids counting every other column.
quarter_df = (
    bikeshare
    .groupby(['year', 'quarter', 'fare_type', 'bike_type'], observed=True)['trip_id']
    .count()
    .reset_index(name='rentals')
)
quarter_df.head(10)
Point plot:
# Point plot of rentals per quarter, one line per year,
# faceted by fare type (rows) and bike type (columns).
# NOTE(review): quarter_df holds one row per year/quarter/fare/bike combination,
# so each point presumably equals that single count rather than an average -- confirm.

# ---- theme ------------------------------------------------
flatui = ['#cc3f9d', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# ---- facet grid -------------------------------------------
# x positions follow the sorted quarter values present in the data
plot_order = quarter_df['quarter'].sort_values().unique()
g = sb.FacetGrid(data=quarter_df, row='fare_type', col='bike_type', hue='year',
                 height=4.5, aspect=1)
g.map(sb.pointplot, "quarter", "rentals", order=plot_order,
      markers=['.'], linestyles="-", ci=None)

# ---- titles -----------------------------------------------
# leave head-room above the facets for the figure title and the legend
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle(
    'Average bike rentals based on quarter of individual years by fare type and bike type',
    fontsize=16, weight='bold')
g.set_titles('Fare = {row_name} | Bike = {col_name}',
             weight='bold', size=14, color='dimgrey')

# ---- axis labels and ticks --------------------------------
g.set_xlabels('\nQuarter of the year', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_xticklabels(size=12)
g.set_yticklabels(size=12)

# ---- hand-built legend ------------------------------------
# one proxy line per palette colour; presumably these match the colours the
# grid assigns to the three year levels -- confirm against the data
custom = [Line2D([], [], marker='.', color=sb.color_palette()[k], linestyle='-', linewidth=2)
          for k in range(3)]
# the anchor offsets look hand-tuned for this grid; re-check if the layout changes
plt.legend(custom, ['2017', '2018', '2019'],
           title='Year', title_fontsize=12, fontsize=10, ncol=3,
           frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
           borderpad=1, borderaxespad=1, labelspacing=0.5,
           scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
           bbox_to_anchor=(-0.6, 2.9))

# ---- spacing and export -----------------------------------
plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.70 Average bike rentals based on quarter of individual years by fare type and bike type.png',
            dpi=300, bbox_inches='tight')
hour, year, fare_type and pass_type columns:
Columns: hour, year, fare_type, pass_type
Data type: (numerical, continuous), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the hour over years:
Create a dataset which contains bike rentals relative to each hour in the day over the respective months in the year. Care should be taken to include only hours that appear in every day of the month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider only the bike rentals recorded for each hour of a day.
# Create a dataset of bike rentals for each hour of every calendar day,
# split by fare type and pass type.
# observed=True keeps only the category combinations that actually occur:
# with categorical grouping keys pandas otherwise emits the full Cartesian
# product of categories, and (depending on the pandas version) reports the
# unobserved ones as NaN or 0 rentals -- zeros would bias the plotted averages.
# Selecting 'trip_id' before counting avoids counting every other column.
hours_df = (
    bikeshare
    .groupby(['year', 'month', 'day', 'hour', 'fare_type', 'pass_type'], observed=True)['trip_id']
    .count()
    .reset_index(name='rentals')
)
hours_df.head(10)
Point plot:
# Point plot of rentals per hour of the day, one line per year,
# faceted by fare type (rows) and pass type (columns).

# ---- theme ------------------------------------------------
flatui = ['#cc3f9d', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# ---- facet grid -------------------------------------------
# x positions follow the sorted hour values present in the data
plot_order = hours_df['hour'].sort_values().unique()
g = sb.FacetGrid(data=hours_df, row='fare_type', col='pass_type', hue='year',
                 height=4, aspect=1)
g.map(sb.pointplot, "hour", "rentals", order=plot_order,
      markers=['.'], linestyles="-", ci=None)

# ---- titles -----------------------------------------------
# leave head-room above the facets for the figure title and the legend
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle(
    'Average bike rentals based on hour of the day over years by fare type and pass type',
    fontsize=16, weight='bold')
g.set_titles('Fare = {row_name} | Pass = {col_name}',
             weight='bold', size=14, color='dimgrey')

# ---- axis labels and ticks --------------------------------
g.set_xlabels('\nHour of the day', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)
for ax in g.axes.flat:
    # keep every fifth tick label and blank the others to avoid crowding
    labels = [tick if i % 5 == 0 else '' for i, tick in enumerate(ax.get_xticklabels())]
    ax.set_xticklabels(labels, size=12)

# ---- hand-built legend ------------------------------------
# one proxy line per palette colour; presumably these match the colours the
# grid assigns to the three year levels -- confirm against the data
custom = [Line2D([], [], marker='.', color=sb.color_palette()[k], linestyle='-', linewidth=2)
          for k in range(3)]
# the anchor offsets look hand-tuned for this grid; re-check if the layout changes
plt.legend(custom, ['2017', '2018', '2019'],
           title='Year', title_fontsize=12, fontsize=10, ncol=3,
           frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
           borderpad=1, borderaxespad=1, labelspacing=0.5,
           scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
           bbox_to_anchor=(-1.15, 2.9))

# ---- spacing and export -----------------------------------
plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.71 Average bike rentals based on hour over the years by fare type and pass type.png',
            dpi=300, bbox_inches='tight')
daytime, year, fare_type and pass_type columns:
Columns: daytime, year, fare_type, pass_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the daytime of the day over years:
Create a dataset which contains bike rentals relative to each daytime in the day over the respective months in the year. Care should be taken to include only daytimes that appear in every day of the month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider only the bike rentals recorded for each daytime of a day.
# Create a dataset of bike rentals for each daytime of every calendar day,
# split by fare type and pass type.
# observed=True keeps only the category combinations that actually occur:
# with categorical grouping keys pandas otherwise emits the full Cartesian
# product of categories, and (depending on the pandas version) reports the
# unobserved ones as NaN or 0 rentals -- zeros would bias the plotted averages.
# Selecting 'trip_id' before counting avoids counting every other column.
daytime_df = (
    bikeshare
    .groupby(['year', 'month', 'day', 'daytime', 'fare_type', 'pass_type'], observed=True)['trip_id']
    .count()
    .reset_index(name='rentals')
)
daytime_df.head(10)
Point plot:
# Point plot of rentals per time of the day, one line per year,
# faceted by fare type (rows) and pass type (columns).

# ---- theme ------------------------------------------------
flatui = ['#cc3f9d', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# ---- facet grid -------------------------------------------
# x positions follow the sorted daytime values present in the data
plot_order = daytime_df['daytime'].sort_values().unique()
g = sb.FacetGrid(data=daytime_df, row='fare_type', col='pass_type', hue='year',
                 height=4, aspect=1)
g.map(sb.pointplot, "daytime", "rentals", order=plot_order,
      markers=['.'], linestyles="-", ci=None)

# ---- titles -----------------------------------------------
# leave head-room above the facets for the figure title and the legend
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle(
    'Average bike rentals based on time of the day over years by fare type and pass type',
    fontsize=16, weight='bold')
g.set_titles('Fare = {row_name} | Pass = {col_name}',
             weight='bold', size=14, color='dimgrey')

# ---- axis labels and ticks --------------------------------
g.set_xlabels('\nTime of the day', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)
for ax in g.axes.flat:
    # re-apply the existing tick labels, tilted by 30 degrees
    ax.set_xticklabels(ax.get_xticklabels(), rotation=30, size=12)

# ---- hand-built legend ------------------------------------
# one proxy line per palette colour; presumably these match the colours the
# grid assigns to the three year levels -- confirm against the data
custom = [Line2D([], [], marker='.', color=sb.color_palette()[k], linestyle='-', linewidth=2)
          for k in range(3)]
# the anchor offsets look hand-tuned for this grid; re-check if the layout changes
plt.legend(custom, ['2017', '2018', '2019'],
           title='Year', title_fontsize=12, fontsize=10, ncol=3,
           frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
           borderpad=1, borderaxespad=1, labelspacing=0.5,
           scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
           bbox_to_anchor=(-1.15, 2.9))

# ---- spacing and export -----------------------------------
plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.72 Average bike rentals based on daytime over the years by fare type and pass type.png',
            dpi=300, bbox_inches='tight')
day, year, fare_type and pass_type columns:
Columns: day, year, fare_type, pass_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the day of the month over years:
Create a dataset which contains bike rentals relative to each day over the respective months in the year. Care should be taken to include only days that appear in every month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider only the bike rentals recorded for each day of a month.
# Create a dataset of bike rentals for each day of every month,
# split by fare type and pass type.
# observed=True keeps only the category combinations that actually occur:
# with categorical grouping keys pandas otherwise emits the full Cartesian
# product of categories, and (depending on the pandas version) reports the
# unobserved ones as NaN or 0 rentals -- zeros would bias the plotted averages.
# Selecting 'trip_id' before counting avoids counting every other column.
days_df = (
    bikeshare
    .groupby(['year', 'month', 'day', 'fare_type', 'pass_type'], observed=True)['trip_id']
    .count()
    .reset_index(name='rentals')
)
days_df.head(10)
Point plot:
# Point plot of rentals per day of the month, one line per year,
# faceted by fare type (rows) and pass type (columns).

# ---- theme ------------------------------------------------
flatui = ['#cc3f9d', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# ---- facet grid -------------------------------------------
# x positions follow the sorted day values present in the data
plot_order = days_df['day'].sort_values().unique()
g = sb.FacetGrid(data=days_df, row='fare_type', col='pass_type', hue='year',
                 height=4, aspect=1)
g.map(sb.pointplot, "day", "rentals", order=plot_order,
      markers=['.'], linestyles="-", ci=None)

# ---- titles -----------------------------------------------
# leave head-room above the facets for the figure title and the legend
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle(
    'Average bike rentals based on day of the month over years by fare type and pass type',
    fontsize=16, weight='bold')
g.set_titles('Fare = {row_name} | Pass = {col_name}',
             weight='bold', size=14, color='dimgrey')

# ---- axis labels and ticks --------------------------------
g.set_xlabels('\nDay of the month', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)
for ax in g.axes.flat:
    # keep every fifth tick label and blank the others to avoid crowding
    labels = [tick if i % 5 == 0 else '' for i, tick in enumerate(ax.get_xticklabels())]
    ax.set_xticklabels(labels, size=12)

# ---- hand-built legend ------------------------------------
# one proxy line per palette colour; presumably these match the colours the
# grid assigns to the three year levels -- confirm against the data
custom = [Line2D([], [], marker='.', color=sb.color_palette()[k], linestyle='-', linewidth=2)
          for k in range(3)]
# the anchor offsets look hand-tuned for this grid; re-check if the layout changes
plt.legend(custom, ['2017', '2018', '2019'],
           title='Year', title_fontsize=12, fontsize=10, ncol=3,
           frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
           borderpad=1, borderaxespad=1, labelspacing=0.5,
           scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
           bbox_to_anchor=(-1.15, 2.9))

# ---- spacing and export -----------------------------------
plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.73 Average bike rentals based on day over the years by fare type and pass type.png',
            dpi=300, bbox_inches='tight')
weekday, year, fare_type and pass_type columns:
Columns: weekday, year, fare_type, pass_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the day of the week over years:
Create a dataset which contains bike rentals relative to each day over the respective weeks in the month. Care should be taken to include only days that appear in every week over individual months. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider only the bike rentals recorded for each day of a week.
# Create a dataset of bike rentals for each weekday of every week,
# split by fare type and pass type.
# observed=True keeps only the category combinations that actually occur:
# with categorical grouping keys pandas otherwise emits the full Cartesian
# product of categories, and (depending on the pandas version) reports the
# unobserved ones as NaN or 0 rentals -- zeros would bias the plotted averages.
# Selecting 'trip_id' before counting avoids counting every other column.
weekday_df = (
    bikeshare
    .groupby(['year', 'month', 'week', 'weekday', 'fare_type', 'pass_type'], observed=True)['trip_id']
    .count()
    .reset_index(name='rentals')
)
weekday_df.head(10)
Point plot:
# Point plot of rentals per day of the week, one line per year,
# faceted by fare type (rows) and pass type (columns).

# ---- theme ------------------------------------------------
flatui = ['#cc3f9d', '#31404a', '#60b6f0']
sb.set_style('white')
sb.set_palette(flatui, n_colors=3, desat=0.8)

# ---- facet grid -------------------------------------------
# x positions follow the sorted weekday values present in the data
plot_order = weekday_df['weekday'].sort_values().unique()
g = sb.FacetGrid(data=weekday_df, row='fare_type', col='pass_type', hue='year',
                 height=4.5, aspect=1)
g.map(sb.pointplot, "weekday", "rentals", order=plot_order,
      markers=['.'], linestyles="-", ci=None)

# ---- titles -----------------------------------------------
# leave head-room above the facets for the figure title and the legend
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle(
    'Average bike rentals based on day of the week over years by fare type and pass type',
    fontsize=16, weight='bold')
g.set_titles('Fare = {row_name} | Pass = {col_name}',
             weight='bold', size=14, color='dimgrey')

# ---- axis labels and ticks --------------------------------
g.set_xlabels('\nDay of the week', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)
for ax in g.axes.flat:
    # re-apply the existing tick labels, tilted by 30 degrees
    ax.set_xticklabels(ax.get_xticklabels(), rotation=30, size=12)

# ---- hand-built legend ------------------------------------
# one proxy line per palette colour; presumably these match the colours the
# grid assigns to the three year levels -- confirm against the data
custom = [Line2D([], [], marker='.', color=sb.color_palette()[k], linestyle='-', linewidth=2)
          for k in range(3)]
# the anchor offsets look hand-tuned for this grid; re-check if the layout changes
plt.legend(custom, ['2017', '2018', '2019'],
           title='Year', title_fontsize=12, fontsize=10, ncol=3,
           frameon=True, fancybox=True, shadow=False, framealpha=1, facecolor='white',
           borderpad=1, borderaxespad=1, labelspacing=0.5,
           scatterpoints=1, markerfirst=True, handlelength=2, handletextpad=0.5,
           bbox_to_anchor=(-1.15, 2.9))

# ---- spacing and export -----------------------------------
plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.74 Average bike rentals based on weekday over the years by fare type and pass type.png',
            dpi=300, bbox_inches='tight')
month, year, fare_type and pass_type columns:
Columns: month, year, fare_type, pass_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the month over years:
Create a dataset which contains bike rentals relative to each month over the respective years. Care should be taken to include only rentals that appear in every month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every month in any year.
# Rentals per month: count of non-null trip_id values in each
# (year, month, fare_type, pass_type) group.
month_df = (
    bikeshare
    .groupby([bikeshare[col] for col in
              ("year", "month", "fare_type", "pass_type")])
    .count()["trip_id"]
    .reset_index(name="rentals")
)
# Preview the first rows (rendered as the cell output in a notebook).
month_df.head(10)
Point plot:
# Styling: white background and a three-colour palette (one colour per hue level).
sb.set_style("white")
flatui = ["#cc3f9d", "#31404a", "#60b6f0"]
sb.set_palette(flatui, n_colors=3, desat=0.8)

# Grid of facets: rows = fare type, columns = pass type, one line per year.
plot_order = month_df["month"].sort_values().unique()
g = sb.FacetGrid(
    data=month_df,
    row="fare_type",
    col="pass_type",
    hue="year",
    height=4,
    aspect=1,
)
# pointplot aggregates `rentals` per month (seaborn's default estimator is
# the mean); ci=None switches the error bars off.
g.map(
    sb.pointplot,
    "month",
    "rentals",
    order=plot_order,
    linestyles="-",
    ci=None,
    markers=["."],
)

# Reserve headroom above the facets for the figure title.
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle(
    "Average bike rentals based on month over years by fare type and pass type",
    fontsize=16,
    weight="bold",
)
g.set_titles(
    "Fare = {row_name} | Pass = {col_name}",
    weight="bold",
    size=14,
    color="dimgrey",
)

# Axis labels and tick-label sizes.
g.set_xlabels("\nMonth of the year", size=14)
g.set_ylabels("Avg. bike rentals\n", size=14)
g.set_yticklabels(size=12)
# Thin out the x axis: blank the labels at even tick positions (0, 2, ...),
# so only every second label stays visible.
for ax in g.axes.flat:
    xlabels = [
        lbl if pos % 2 else ""
        for pos, lbl in enumerate(ax.get_xticklabels())
    ]
    ax.set_xticklabels(xlabels, size=12)

# Hand-built legend: one proxy line per palette colour, labelled by year.
custom = [
    Line2D([], [], marker=".", color=sb.color_palette()[idx],
           linestyle="-", linewidth=2)
    for idx in range(3)
]
# plt.legend attaches to the current axes; the bbox_to_anchor offsets were
# presumably hand-tuned to lift the legend above the grid.
plt.legend(
    custom,
    ["2017", "2018", "2019"],
    title="Year",
    title_fontsize=12,
    fontsize=10,
    ncol=3,
    frameon=True,
    fancybox=True,
    shadow=False,
    framealpha=1,
    facecolor="white",
    scatterpoints=1,
    markerfirst=True,
    borderpad=1,
    borderaxespad=1,
    labelspacing=0.5,
    handlelength=2,
    handletextpad=0.5,
    bbox_to_anchor=(-1.15, 2.9),
)

# Tighten the gaps between facets, then save; bbox_inches="tight" expands the
# saved area so that all x and y labels are included.
plt.subplots_adjust(wspace=0.05, hspace=0.3)
plt.savefig(
    "plots/3.3.75 Average bike rentals based on month over the years by fare type and pass type.png",
    dpi=300,
    bbox_inches="tight",
)
quarter, year, fare_type and pass_type columns:
Columns: quarter, year, fare_type, pass_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the quarter over years:
Create a dataset which contains bike rentals relative to each quarter over the respective years. Care should be taken to include only rentals that appear in every quarter over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every quarter in any year.
# Rentals per quarter: count of non-null trip_id values in each
# (year, quarter, fare_type, pass_type) group.
quarter_df = (
    bikeshare
    .groupby([bikeshare[col] for col in
              ("year", "quarter", "fare_type", "pass_type")])
    .count()["trip_id"]
    .reset_index(name="rentals")
)
# Preview the first rows (rendered as the cell output in a notebook).
quarter_df.head(10)
Point plot:
# Styling: white background and a three-colour palette (one colour per hue level).
sb.set_style("white")
flatui = ["#cc3f9d", "#31404a", "#60b6f0"]
sb.set_palette(flatui, n_colors=3, desat=0.8)

# Grid of facets: rows = fare type, columns = pass type, one line per year.
plot_order = quarter_df["quarter"].sort_values().unique()
g = sb.FacetGrid(
    data=quarter_df,
    row="fare_type",
    col="pass_type",
    hue="year",
    height=4.5,
    aspect=1,
)
# pointplot aggregates `rentals` per quarter (seaborn's default estimator is
# the mean); ci=None switches the error bars off.
g.map(
    sb.pointplot,
    "quarter",
    "rentals",
    order=plot_order,
    linestyles="-",
    ci=None,
    markers=["."],
)

# Reserve headroom above the facets for the figure title.
g.fig.subplots_adjust(top=0.75)
g.fig.suptitle(
    "Average bike rentals based on quarter of individual years by fare type and pass type",
    fontsize=16,
    weight="bold",
)
g.set_titles(
    "Fare = {row_name} | Pass = {col_name}",
    weight="bold",
    size=14,
    color="dimgrey",
)

# Axis labels and tick-label sizes (few quarters, so no label thinning needed).
g.set_xlabels("\nQuarter of the year", size=14)
g.set_ylabels("Avg. bike rentals\n", size=14)
g.set_xticklabels(size=12)
g.set_yticklabels(size=12)

# Hand-built legend: one proxy line per palette colour, labelled by year.
custom = [
    Line2D([], [], marker=".", color=sb.color_palette()[idx],
           linestyle="-", linewidth=2)
    for idx in range(3)
]
# plt.legend attaches to the current axes; the bbox_to_anchor offsets were
# presumably hand-tuned to lift the legend above the grid.
plt.legend(
    custom,
    ["2017", "2018", "2019"],
    title="Year",
    title_fontsize=12,
    fontsize=10,
    ncol=3,
    frameon=True,
    fancybox=True,
    shadow=False,
    framealpha=1,
    facecolor="white",
    scatterpoints=1,
    markerfirst=True,
    borderpad=1,
    borderaxespad=1,
    labelspacing=0.5,
    handlelength=2,
    handletextpad=0.5,
    bbox_to_anchor=(-1.15, 2.9),
)

# Tighten the gaps between facets, then save; bbox_inches="tight" expands the
# saved area so that all x and y labels are included.
plt.subplots_adjust(wspace=0.05, hspace=0.3)
plt.savefig(
    "plots/3.3.76 Average bike rentals based on quarter of individual years by fare type and pass type.png",
    dpi=300,
    bbox_inches="tight",
)
hour, year, bike_type and pass_type columns:
Columns: hour, year, bike_type, pass_type
Data type: (numerical, continuous), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the hour over years:
Create a dataset which contains bike rentals relative to each hour in the day over the respective months in the year. Care should be taken to include only hours that appear in every day of the month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every hour in any day.
# Rentals per hour of the day: count of non-null trip_id values in each
# (year, month, day, hour, bike_type, pass_type) group.
hours_df = (
    bikeshare
    .groupby([bikeshare[col] for col in
              ("year", "month", "day", "hour", "bike_type", "pass_type")])
    .count()["trip_id"]
    .reset_index(name="rentals")
)
# Preview the first rows (rendered as the cell output in a notebook).
hours_df.head(10)
Point plot:
# Styling: white background and a three-colour palette (one colour per hue level).
sb.set_style("white")
flatui = ["#cc3f9d", "#31404a", "#60b6f0"]
sb.set_palette(flatui, n_colors=3, desat=0.8)

# Grid of facets: rows = bike type, columns = pass type, one line per year.
plot_order = hours_df["hour"].sort_values().unique()
g = sb.FacetGrid(
    data=hours_df,
    row="bike_type",
    col="pass_type",
    hue="year",
    height=4,
    aspect=1,
)
# pointplot aggregates `rentals` per hour (seaborn's default estimator is
# the mean); ci=None switches the error bars off.
g.map(
    sb.pointplot,
    "hour",
    "rentals",
    order=plot_order,
    linestyles="-",
    ci=None,
    markers=["."],
)

# Reserve headroom above the facets for the figure title.
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle(
    "Average bike rentals based on hour of the day over years by bike type and pass type",
    fontsize=16,
    weight="bold",
)
g.set_titles(
    "Bike = {row_name} | Pass = {col_name}",
    weight="bold",
    size=14,
    color="dimgrey",
)

# Axis labels and tick-label sizes.
g.set_xlabels("\nHour of the day", size=14)
g.set_ylabels("Avg. bike rentals\n", size=14)
g.set_yticklabels(size=12)
# Thin out the x axis: keep the label at every fifth tick position
# (0, 5, 10, ...) and blank all the others.
for ax in g.axes.flat:
    labels = [
        lbl if pos % 5 == 0 else ""
        for pos, lbl in enumerate(ax.get_xticklabels())
    ]
    ax.set_xticklabels(labels, size=12)

# Hand-built legend: one proxy line per palette colour, labelled by year.
custom = [
    Line2D([], [], marker=".", color=sb.color_palette()[idx],
           linestyle="-", linewidth=2)
    for idx in range(3)
]
# plt.legend attaches to the current axes; the bbox_to_anchor offsets were
# presumably hand-tuned to lift the legend above the grid.
plt.legend(
    custom,
    ["2017", "2018", "2019"],
    title="Year",
    title_fontsize=12,
    fontsize=10,
    ncol=3,
    frameon=True,
    fancybox=True,
    shadow=False,
    framealpha=1,
    facecolor="white",
    scatterpoints=1,
    markerfirst=True,
    borderpad=1,
    borderaxespad=1,
    labelspacing=0.5,
    handlelength=2,
    handletextpad=0.5,
    bbox_to_anchor=(-1.15, 5.5),
)

# Tighten the gaps between facets, then save; bbox_inches="tight" expands the
# saved area so that all x and y labels are included.
plt.subplots_adjust(wspace=0.05, hspace=0.3)
plt.savefig(
    "plots/3.3.77 Average bike rentals based on hour over the years by bike type and pass type.png",
    dpi=300,
    bbox_inches="tight",
)
daytime, year, bike_type and pass_type columns:
Columns: daytime, year, bike_type, pass_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the daytime of the day over years:
Create a dataset which contains bike rentals relative to each daytime in the day over the respective months in the year. Care should be taken to include only daytimes that appear in every day of the month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every daytime in any day.
# Rentals per time of day: count of non-null trip_id values in each
# (year, month, day, daytime, bike_type, pass_type) group.
daytime_df = (
    bikeshare
    .groupby([bikeshare[col] for col in
              ("year", "month", "day", "daytime", "bike_type", "pass_type")])
    .count()["trip_id"]
    .reset_index(name="rentals")
)
# Preview the first rows (rendered as the cell output in a notebook).
daytime_df.head(10)
Point plot:
# Styling: white background and a three-colour palette (one colour per hue level).
sb.set_style("white")
flatui = ["#cc3f9d", "#31404a", "#60b6f0"]
sb.set_palette(flatui, n_colors=3, desat=0.8)

# Grid of facets: rows = bike type, columns = pass type, one line per year.
plot_order = daytime_df["daytime"].sort_values().unique()
g = sb.FacetGrid(
    data=daytime_df,
    row="bike_type",
    col="pass_type",
    hue="year",
    height=4,
    aspect=1,
)
# pointplot aggregates `rentals` per daytime (seaborn's default estimator is
# the mean); ci=None switches the error bars off.
g.map(
    sb.pointplot,
    "daytime",
    "rentals",
    order=plot_order,
    linestyles="-",
    ci=None,
    markers=["."],
)

# Reserve headroom above the facets for the figure title.
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle(
    "Average bike rentals based on time of the day over years by bike type and pass type",
    fontsize=16,
    weight="bold",
)
g.set_titles(
    "Bike = {row_name} | Pass = {col_name}",
    weight="bold",
    size=14,
    color="dimgrey",
)

# Axis labels and tick-label sizes.
g.set_xlabels("\nTime of the day", size=14)
g.set_ylabels("Avg. bike rentals\n", size=14)
g.set_yticklabels(size=12)
# Re-apply each facet's own x tick labels, tilted by 30 degrees.
for ax in g.axes.flat:
    ax.set_xticklabels(ax.get_xticklabels(), rotation=30, size=12)

# Hand-built legend: one proxy line per palette colour, labelled by year.
custom = [
    Line2D([], [], marker=".", color=sb.color_palette()[idx],
           linestyle="-", linewidth=2)
    for idx in range(3)
]
# plt.legend attaches to the current axes; the bbox_to_anchor offsets were
# presumably hand-tuned to lift the legend above the grid.
plt.legend(
    custom,
    ["2017", "2018", "2019"],
    title="Year",
    title_fontsize=12,
    fontsize=10,
    ncol=3,
    frameon=True,
    fancybox=True,
    shadow=False,
    framealpha=1,
    facecolor="white",
    scatterpoints=1,
    markerfirst=True,
    borderpad=1,
    borderaxespad=1,
    labelspacing=0.5,
    handlelength=2,
    handletextpad=0.5,
    bbox_to_anchor=(-1.15, 5.5),
)

# Tighten the gaps between facets, then save; bbox_inches="tight" expands the
# saved area so that all x and y labels are included.
plt.subplots_adjust(wspace=0.05, hspace=0.3)
plt.savefig(
    "plots/3.3.78 Average bike rentals based on daytime over the years by bike type and pass type.png",
    dpi=300,
    bbox_inches="tight",
)
day, year, bike_type and pass_type columns:
Columns: day, year, bike_type, pass_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the day of the month over years:
Create a dataset which contains bike rentals relative to each day over the respective months in the year. Care should be taken to include only days that appear in every month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every day in any month.
# Rentals per day of the month: count of non-null trip_id values in each
# (year, month, day, bike_type, pass_type) group.
days_df = (
    bikeshare
    .groupby([bikeshare[col] for col in
              ("year", "month", "day", "bike_type", "pass_type")])
    .count()["trip_id"]
    .reset_index(name="rentals")
)
# Preview the first rows (rendered as the cell output in a notebook).
days_df.head(10)
Point plot:
# Styling: white background and a three-colour palette (one colour per hue level).
sb.set_style("white")
flatui = ["#cc3f9d", "#31404a", "#60b6f0"]
sb.set_palette(flatui, n_colors=3, desat=0.8)

# Grid of facets: rows = bike type, columns = pass type, one line per year.
plot_order = days_df["day"].sort_values().unique()
g = sb.FacetGrid(
    data=days_df,
    row="bike_type",
    col="pass_type",
    hue="year",
    height=4,
    aspect=1,
)
# pointplot aggregates `rentals` per day (seaborn's default estimator is
# the mean); ci=None switches the error bars off.
g.map(
    sb.pointplot,
    "day",
    "rentals",
    order=plot_order,
    linestyles="-",
    ci=None,
    markers=["."],
)

# Reserve headroom above the facets for the figure title.
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle(
    "Average bike rentals based on day of the month over years by bike type and pass type",
    fontsize=16,
    weight="bold",
)
g.set_titles(
    "Bike = {row_name} | Pass = {col_name}",
    weight="bold",
    size=14,
    color="dimgrey",
)

# Axis labels and tick-label sizes.
g.set_xlabels("\nDay of the month", size=14)
g.set_ylabels("Avg. bike rentals\n", size=14)
g.set_yticklabels(size=12)
# Thin out the x axis: keep the label at every fifth tick position
# (0, 5, 10, ...) and blank all the others.
for ax in g.axes.flat:
    labels = [
        lbl if pos % 5 == 0 else ""
        for pos, lbl in enumerate(ax.get_xticklabels())
    ]
    ax.set_xticklabels(labels, size=12)

# Hand-built legend: one proxy line per palette colour, labelled by year.
custom = [
    Line2D([], [], marker=".", color=sb.color_palette()[idx],
           linestyle="-", linewidth=2)
    for idx in range(3)
]
# plt.legend attaches to the current axes; the bbox_to_anchor offsets were
# presumably hand-tuned to lift the legend above the grid.
plt.legend(
    custom,
    ["2017", "2018", "2019"],
    title="Year",
    title_fontsize=12,
    fontsize=10,
    ncol=3,
    frameon=True,
    fancybox=True,
    shadow=False,
    framealpha=1,
    facecolor="white",
    scatterpoints=1,
    markerfirst=True,
    borderpad=1,
    borderaxespad=1,
    labelspacing=0.5,
    handlelength=2,
    handletextpad=0.5,
    bbox_to_anchor=(-1.15, 5.5),
)

# Tighten the gaps between facets, then save; bbox_inches="tight" expands the
# saved area so that all x and y labels are included.
plt.subplots_adjust(wspace=0.05, hspace=0.3)
plt.savefig(
    "plots/3.3.79 Average bike rentals based on day over the years by bike type and pass type.png",
    dpi=300,
    bbox_inches="tight",
)
weekday, year, bike_type and pass_type columns:
Columns: weekday, year, bike_type, pass_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the day of the week over years:
Create a dataset which contains bike rentals relative to each day over the respective week in the month. Care should be taken to include only days that appear in every week over individual months. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every day in any week.
# Rentals per day of the week: count of non-null trip_id values in each
# (year, month, week, weekday, bike_type, pass_type) group.
weekday_df = (
    bikeshare
    .groupby([bikeshare[col] for col in
              ("year", "month", "week", "weekday", "bike_type", "pass_type")])
    .count()["trip_id"]
    .reset_index(name="rentals")
)
# Preview the first rows (rendered as the cell output in a notebook).
weekday_df.head(10)
Point plot:
# Styling: white background and a three-colour palette (one colour per hue level).
sb.set_style("white")
flatui = ["#cc3f9d", "#31404a", "#60b6f0"]
sb.set_palette(flatui, n_colors=3, desat=0.8)

# Grid of facets: rows = bike type, columns = pass type, one line per year.
plot_order = weekday_df["weekday"].sort_values().unique()
g = sb.FacetGrid(
    data=weekday_df,
    row="bike_type",
    col="pass_type",
    hue="year",
    height=4.5,
    aspect=1,
)
# pointplot aggregates `rentals` per weekday (seaborn's default estimator is
# the mean); ci=None switches the error bars off.
g.map(
    sb.pointplot,
    "weekday",
    "rentals",
    order=plot_order,
    linestyles="-",
    ci=None,
    markers=["."],
)

# Reserve headroom above the facets for the figure title.
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle(
    "Average bike rentals based on day of the week over years by bike type and pass type",
    fontsize=16,
    weight="bold",
)
g.set_titles(
    "Bike = {row_name} | Pass = {col_name}",
    weight="bold",
    size=14,
    color="dimgrey",
)

# Axis labels and tick-label sizes.
g.set_xlabels("\nDay of the week", size=14)
g.set_ylabels("Avg. bike rentals\n", size=14)
g.set_yticklabels(size=12)
# Re-apply each facet's own x tick labels, tilted by 30 degrees.
for ax in g.axes.flat:
    ax.set_xticklabels(ax.get_xticklabels(), rotation=30, size=12)

# Hand-built legend: one proxy line per palette colour, labelled by year.
custom = [
    Line2D([], [], marker=".", color=sb.color_palette()[idx],
           linestyle="-", linewidth=2)
    for idx in range(3)
]
# plt.legend attaches to the current axes; the bbox_to_anchor offsets were
# presumably hand-tuned to lift the legend above the grid.
plt.legend(
    custom,
    ["2017", "2018", "2019"],
    title="Year",
    title_fontsize=12,
    fontsize=10,
    ncol=3,
    frameon=True,
    fancybox=True,
    shadow=False,
    framealpha=1,
    facecolor="white",
    scatterpoints=1,
    markerfirst=True,
    borderpad=1,
    borderaxespad=1,
    labelspacing=0.5,
    handlelength=2,
    handletextpad=0.5,
    bbox_to_anchor=(-1.15, 5.5),
)

# Tighten the gaps between facets, then save; bbox_inches="tight" expands the
# saved area so that all x and y labels are included.
plt.subplots_adjust(wspace=0.05, hspace=0.3)
plt.savefig(
    "plots/3.3.80 Average bike rentals based on weekday over the years by bike type and pass type.png",
    dpi=300,
    bbox_inches="tight",
)
month, year, bike_type and pass_type columns:
Columns: month, year, bike_type, pass_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the month over years:
Create a dataset which contains bike rentals relative to each month over the respective years. Care should be taken to include only rentals that appear in every month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every month in any year.
# Rentals per month: count of non-null trip_id values in each
# (year, month, bike_type, pass_type) group.
month_df = (
    bikeshare
    .groupby([bikeshare[col] for col in
              ("year", "month", "bike_type", "pass_type")])
    .count()["trip_id"]
    .reset_index(name="rentals")
)
# Preview the first rows (rendered as the cell output in a notebook).
month_df.head(10)
Point plot:
# Styling: white background and a three-colour palette (one colour per hue level).
sb.set_style("white")
flatui = ["#cc3f9d", "#31404a", "#60b6f0"]
sb.set_palette(flatui, n_colors=3, desat=0.8)

# Grid of facets: rows = bike type, columns = pass type, one line per year.
plot_order = month_df["month"].sort_values().unique()
g = sb.FacetGrid(
    data=month_df,
    row="bike_type",
    col="pass_type",
    hue="year",
    height=4,
    aspect=1,
)
# pointplot aggregates `rentals` per month (seaborn's default estimator is
# the mean); ci=None switches the error bars off.
g.map(
    sb.pointplot,
    "month",
    "rentals",
    order=plot_order,
    linestyles="-",
    ci=None,
    markers=["."],
)

# Reserve headroom above the facets for the figure title.
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle(
    "Average bike rentals based on month over years by bike type and pass type",
    fontsize=16,
    weight="bold",
)
g.set_titles(
    "Bike = {row_name} | Pass = {col_name}",
    weight="bold",
    size=14,
    color="dimgrey",
)

# Axis labels and tick-label sizes.
g.set_xlabels("\nMonth of the year", size=14)
g.set_ylabels("Avg. bike rentals\n", size=14)
g.set_yticklabels(size=12)
# Thin out the x axis: blank the labels at even tick positions (0, 2, ...),
# so only every second label stays visible.
for ax in g.axes.flat:
    xlabels = [
        lbl if pos % 2 else ""
        for pos, lbl in enumerate(ax.get_xticklabels())
    ]
    ax.set_xticklabels(xlabels, size=12)

# Hand-built legend: one proxy line per palette colour, labelled by year.
custom = [
    Line2D([], [], marker=".", color=sb.color_palette()[idx],
           linestyle="-", linewidth=2)
    for idx in range(3)
]
# plt.legend attaches to the current axes; the bbox_to_anchor offsets were
# presumably hand-tuned to lift the legend above the grid.
plt.legend(
    custom,
    ["2017", "2018", "2019"],
    title="Year",
    title_fontsize=12,
    fontsize=10,
    ncol=3,
    frameon=True,
    fancybox=True,
    shadow=False,
    framealpha=1,
    facecolor="white",
    scatterpoints=1,
    markerfirst=True,
    borderpad=1,
    borderaxespad=1,
    labelspacing=0.5,
    handlelength=2,
    handletextpad=0.5,
    bbox_to_anchor=(-1.15, 5.5),
)

# Tighten the gaps between facets, then save; bbox_inches="tight" expands the
# saved area so that all x and y labels are included.
plt.subplots_adjust(wspace=0.05, hspace=0.3)
plt.savefig(
    "plots/3.3.81 Average bike rentals based on month over the years by bike type and pass type.png",
    dpi=300,
    bbox_inches="tight",
)
quarter, year, bike_type and pass_type columns:
Columns: quarter, year, bike_type, pass_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the quarter over years:
Create a dataset which contains bike rentals relative to each quarter over the respective years. Care should be taken to include only rentals that appear in every quarter over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every quarter in any year.
# Rentals per quarter: count of non-null trip_id values in each
# (year, quarter, bike_type, pass_type) group.
quarter_df = (
    bikeshare
    .groupby([bikeshare[col] for col in
              ("year", "quarter", "bike_type", "pass_type")])
    .count()["trip_id"]
    .reset_index(name="rentals")
)
# Preview the first rows (rendered as the cell output in a notebook).
quarter_df.head(10)
Point plot:
# Styling: white background and a three-colour palette (one colour per hue level).
sb.set_style("white")
flatui = ["#cc3f9d", "#31404a", "#60b6f0"]
sb.set_palette(flatui, n_colors=3, desat=0.8)

# Grid of facets: rows = bike type, columns = pass type, one line per year.
plot_order = quarter_df["quarter"].sort_values().unique()
g = sb.FacetGrid(
    data=quarter_df,
    row="bike_type",
    col="pass_type",
    hue="year",
    height=4.5,
    aspect=1,
)
# pointplot aggregates `rentals` per quarter (seaborn's default estimator is
# the mean); ci=None switches the error bars off.
g.map(
    sb.pointplot,
    "quarter",
    "rentals",
    order=plot_order,
    linestyles="-",
    ci=None,
    markers=["."],
)

# Reserve headroom above the facets for the figure title.
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle(
    "Average bike rentals based on quarter of individual years by bike type and pass type",
    fontsize=16,
    weight="bold",
)
g.set_titles(
    "Bike = {row_name} | Pass = {col_name}",
    weight="bold",
    size=14,
    color="dimgrey",
)

# Axis labels and tick-label sizes (few quarters, so no label thinning needed).
g.set_xlabels("\nQuarter of the year", size=14)
g.set_ylabels("Avg. bike rentals\n", size=14)
g.set_xticklabels(size=12)
g.set_yticklabels(size=12)

# Hand-built legend: one proxy line per palette colour, labelled by year.
custom = [
    Line2D([], [], marker=".", color=sb.color_palette()[idx],
           linestyle="-", linewidth=2)
    for idx in range(3)
]
# plt.legend attaches to the current axes; the bbox_to_anchor offsets were
# presumably hand-tuned to lift the legend above the grid.
plt.legend(
    custom,
    ["2017", "2018", "2019"],
    title="Year",
    title_fontsize=12,
    fontsize=10,
    ncol=3,
    frameon=True,
    fancybox=True,
    shadow=False,
    framealpha=1,
    facecolor="white",
    scatterpoints=1,
    markerfirst=True,
    borderpad=1,
    borderaxespad=1,
    labelspacing=0.5,
    handlelength=2,
    handletextpad=0.5,
    bbox_to_anchor=(-1.15, 5.5),
)

# Tighten the gaps between facets, then save; bbox_inches="tight" expands the
# saved area so that all x and y labels are included.
plt.subplots_adjust(wspace=0.05, hspace=0.3)
plt.savefig(
    "plots/3.3.82 Average bike rentals based on quarter of individual years by bike type and pass type.png",
    dpi=300,
    bbox_inches="tight",
)
hour, trip_type, bike_type and pass_type columns:
Columns: hour, trip_type, bike_type, pass_type
Data type: (numerical, continuous), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the hour over trip_type:
Create a dataset which contains bike rentals relative to each hour in the day over the respective months in the year. Care should be taken to include only hours that appear in every day of the month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every hour in any day.
# Rentals per hour of the day: count of non-null trip_id values in each
# (year, month, day, hour, trip_type, bike_type, pass_type) group.
hours_df = (
    bikeshare
    .groupby([bikeshare[col] for col in
              ("year", "month", "day", "hour",
               "trip_type", "bike_type", "pass_type")])
    .count()["trip_id"]
    .reset_index(name="rentals")
)
# Preview the first rows (rendered as the cell output in a notebook).
hours_df.head(10)
Point plot:
# Styling: white background and a two-colour palette (one colour per trip type).
sb.set_style("white")
flatui = ["#e36297", "#42d4be"]
sb.set_palette(flatui, n_colors=2, desat=0.8)

# Grid of facets: rows = bike type, columns = pass type, one line per trip type.
plot_order = hours_df["hour"].sort_values().unique()
g = sb.FacetGrid(
    data=hours_df,
    row="bike_type",
    col="pass_type",
    hue="trip_type",
    height=4,
    aspect=1,
)
# pointplot aggregates `rentals` per hour (seaborn's default estimator is
# the mean); ci=None switches the error bars off.
g.map(
    sb.pointplot,
    "hour",
    "rentals",
    order=plot_order,
    linestyles="-",
    ci=None,
    markers=["."],
)

# Reserve headroom above the facets for the figure title.
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle(
    "Average bike rentals based on hour of the day over trip type by bike type and pass type",
    fontsize=16,
    weight="bold",
)
g.set_titles(
    "Bike = {row_name} | Pass = {col_name}",
    weight="bold",
    size=14,
    color="dimgrey",
)

# Axis labels and tick-label sizes.
g.set_xlabels("\nHour of the day", size=14)
g.set_ylabels("Avg. bike rentals\n", size=14)
g.set_yticklabels(size=12)
# Thin out the x axis: keep the label at every fifth tick position
# (0, 5, 10, ...) and blank all the others.
for ax in g.axes.flat:
    labels = [
        lbl if pos % 5 == 0 else ""
        for pos, lbl in enumerate(ax.get_xticklabels())
    ]
    ax.set_xticklabels(labels, size=12)

# Hand-built legend: one proxy line per palette colour, labelled by trip type.
custom = [
    Line2D([], [], marker=".", color=sb.color_palette()[idx],
           linestyle="-", linewidth=2)
    for idx in range(2)
]
# plt.legend attaches to the current axes; the bbox_to_anchor offsets were
# presumably hand-tuned to lift the legend above the grid.
plt.legend(
    custom,
    ["One Way", "Round Trip"],
    title="Trip type",
    title_fontsize=12,
    fontsize=10,
    ncol=3,
    frameon=True,
    fancybox=True,
    shadow=False,
    framealpha=1,
    facecolor="white",
    scatterpoints=1,
    markerfirst=True,
    borderpad=1,
    borderaxespad=1,
    labelspacing=0.5,
    handlelength=2,
    handletextpad=0.5,
    bbox_to_anchor=(-1.15, 5.5),
)

# Tighten the gaps between facets, then save; bbox_inches="tight" expands the
# saved area so that all x and y labels are included.
plt.subplots_adjust(wspace=0.05, hspace=0.3)
plt.savefig(
    "plots/3.3.83 Average bike rentals based on hour over trip type by bike type and pass type.png",
    dpi=300,
    bbox_inches="tight",
)
daytime, trip_type, bike_type and pass_type columns:
Columns: daytime, trip_type, bike_type, pass_type
Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal)
Plot: Point plot
Find average rentals based on the daytime of the day over trip_type:
Create a dataset which contains bike rentals relative to each daytime in the day over the respective months in the year. Care should be taken to include only daytimes that appear in every day of the month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider bike rentals subjected to every daytime in any day.
# Rentals per time of day: count of non-null trip_id values in each
# (year, month, day, daytime, trip_type, bike_type, pass_type) group.
daytime_df = (
    bikeshare
    .groupby([bikeshare[col] for col in
              ("year", "month", "day", "daytime",
               "trip_type", "bike_type", "pass_type")])
    .count()["trip_id"]
    .reset_index(name="rentals")
)
# Preview the first rows (rendered as the cell output in a notebook).
daytime_df.head(10)
Point plot:
# Styling: white background and a two-colour palette (one colour per trip type).
sb.set_style("white")
flatui = ["#e36297", "#42d4be"]
sb.set_palette(flatui, n_colors=2, desat=0.8)

# Grid of facets: rows = bike type, columns = pass type, one line per trip type.
plot_order = daytime_df["daytime"].sort_values().unique()
g = sb.FacetGrid(
    data=daytime_df,
    row="bike_type",
    col="pass_type",
    hue="trip_type",
    height=4,
    aspect=1,
)
# pointplot aggregates `rentals` per daytime (seaborn's default estimator is
# the mean); ci=None switches the error bars off.
g.map(
    sb.pointplot,
    "daytime",
    "rentals",
    order=plot_order,
    linestyles="-",
    ci=None,
    markers=["."],
)

# Reserve headroom above the facets for the figure title.
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle(
    "Average bike rentals based on time of the day over trip type by bike type and pass type",
    fontsize=16,
    weight="bold",
)
g.set_titles(
    "Bike = {row_name} | Pass = {col_name}",
    weight="bold",
    size=14,
    color="dimgrey",
)

# Axis labels and tick-label sizes.
g.set_xlabels("\nTime of the day", size=14)
g.set_ylabels("Avg. bike rentals\n", size=14)
g.set_yticklabels(size=12)
# Re-apply each facet's own x tick labels, tilted by 30 degrees.
for ax in g.axes.flat:
    ax.set_xticklabels(ax.get_xticklabels(), rotation=30, size=12)

# Hand-built legend: one proxy line per palette colour, labelled by trip type.
custom = [
    Line2D([], [], marker=".", color=sb.color_palette()[idx],
           linestyle="-", linewidth=2)
    for idx in range(2)
]
# plt.legend attaches to the current axes; the bbox_to_anchor offsets were
# presumably hand-tuned to lift the legend above the grid.
plt.legend(
    custom,
    ["One Way", "Round Trip"],
    title="Trip type",
    title_fontsize=12,
    fontsize=10,
    ncol=3,
    frameon=True,
    fancybox=True,
    shadow=False,
    framealpha=1,
    facecolor="white",
    scatterpoints=1,
    markerfirst=True,
    borderpad=1,
    borderaxespad=1,
    labelspacing=0.5,
    handlelength=2,
    handletextpad=0.5,
    bbox_to_anchor=(-1.15, 5.5),
)

# Tighten the gaps between facets, then save; bbox_inches="tight" expands the
# saved area so that all x and y labels are included.
plt.subplots_adjust(wspace=0.05, hspace=0.3)
plt.savefig(
    "plots/3.3.84 Average bike rentals based on daytime over the trip type by bike type and pass type.png",
    dpi=300,
    bbox_inches="tight",
)
day, trip_type, bike_type and pass_type columns:¶ Columns: day, trip_type, bike_type, pass_type. Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal). Plot: Point plot. Find average rentals based on the day of the month over trip_type:
Create a dataset which contains bike rentals relative to each day over the respective months in the year. Care should be taken to include only days that appear in every month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider the bike rentals recorded for every day in any month.
# Rentals per calendar day (year, month, day) for every trip type / bike type /
# pass type combination: count the non-null trip_id values in each group.
# NOTE(review): if the grouping columns are categorical, pandas' `observed`
# setting decides whether unobserved combinations appear - confirm this
# matches the "available combinations only" intent described above.
group_keys = ["year", "month", "day",
              "trip_type", "bike_type", "pass_type"]
days_df = (
    bikeshare.groupby(group_keys)["trip_id"]
    .count()
    .reset_index(name="rentals")
)
days_df.head(10)
Point plot:
# Point plot of rentals per day of the month, one line per trip type,
# faceted by bike type (rows) and pass type (columns).
sb.set_style('white')
trip_type_colors = ['#e36297', '#42d4be']
sb.set_palette(trip_type_colors, n_colors=2, desat=0.8)

# x-axis categories in ascending order
x_order = days_df['day'].sort_values().unique()
g = sb.FacetGrid(data=days_df, row='bike_type', col='pass_type',
                 hue='trip_type', height=4, aspect=1)
# pointplot aggregates `rentals` per x category (seaborn's default estimator
# is the mean); ci=None switches off the error bars.
g.map(sb.pointplot, "day", "rentals",
      order=x_order, linestyles="-", ci=None, markers=['.'])

# leave headroom above the grid for the figure-level title
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Average bike rentals based on day of the month over trip type by bike type and pass type',
               fontsize=16, weight='bold')
g.set_titles('Bike = {row_name} | Pass = {col_name}',
             weight='bold', size=14, color='dimgrey')

# axis labels and tick formatting
g.set_xlabels('\nDay of the month', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)
for facet_ax in g.axes.flat:
    # keep every fifth tick label (positions 0, 5, 10, ...) and blank the rest
    thinned = [
        tick if pos % 5 == 0 else ''
        for pos, tick in enumerate(facet_ax.get_xticklabels())
    ]
    facet_ax.set_xticklabels(thinned, size=12)

# hand-built legend: one proxy line per trip type, coloured from the active palette
legend_handles = [
    Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
    for shade in sb.color_palette()[:2]
]
legend_style = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha=1,
    borderpad=1, borderaxespad=1, labelspacing=0.5, ncol=3,
    title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5,
    # anchor is relative to the current (last drawn) axes; the offsets were
    # presumably hand-tuned for this grid layout
    bbox_to_anchor=(-1.15, 5.5),
)
plt.legend(legend_handles, ['One Way', 'Round Trip'], **legend_style)

plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.85 Average bike rentals based on day over trip type by bike type and pass type.png',
            dpi=300, bbox_inches='tight')
weekday, trip_type, bike_type and pass_type columns:¶ Columns: weekday, trip_type, bike_type, pass_type. Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal). Plot: Point plot. Find average rentals based on the day of the week over trip_type:
Create a dataset which contains bike rentals relative to each day over the respective weeks in the month. Care should be taken to include only days that appear in every week over individual months. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider the bike rentals recorded for every day in any week.
# Rentals per (year, month, week, weekday) for every trip type / bike type /
# pass type combination: count the non-null trip_id values in each group.
# NOTE(review): if the grouping columns are categorical, pandas' `observed`
# setting decides whether unobserved combinations appear - confirm this
# matches the "available combinations only" intent described above.
group_keys = ["year", "month", "week", "weekday",
              "trip_type", "bike_type", "pass_type"]
weekday_df = (
    bikeshare.groupby(group_keys)["trip_id"]
    .count()
    .reset_index(name="rentals")
)
weekday_df.head(10)
Point plot:
# Point plot of rentals per day of the week, one line per trip type,
# faceted by bike type (rows) and pass type (columns).
sb.set_style('white')
trip_type_colors = ['#e36297', '#42d4be']
sb.set_palette(trip_type_colors, n_colors=2, desat=0.8)

# x-axis categories in ascending order
x_order = weekday_df['weekday'].sort_values().unique()
g = sb.FacetGrid(data=weekday_df, row='bike_type', col='pass_type',
                 hue='trip_type', height=4.5, aspect=1)
# pointplot aggregates `rentals` per x category (seaborn's default estimator
# is the mean); ci=None switches off the error bars.
g.map(sb.pointplot, "weekday", "rentals",
      order=x_order, linestyles="-", ci=None, markers=['.'])

# leave headroom above the grid for the figure-level title
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Average bike rentals based on day of the week over trip type by bike type and pass type',
               fontsize=16, weight='bold')
g.set_titles('Bike = {row_name} | Pass = {col_name}',
             weight='bold', size=14, color='dimgrey')

# axis labels and tick formatting
g.set_xlabels('\nDay of the week', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)
for facet_ax in g.axes.flat:
    # tilt the weekday names so they do not overlap
    facet_ax.set_xticklabels(facet_ax.get_xticklabels(), rotation=30, size=12)

# hand-built legend: one proxy line per trip type, coloured from the active palette
legend_handles = [
    Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
    for shade in sb.color_palette()[:2]
]
legend_style = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha=1,
    borderpad=1, borderaxespad=1, labelspacing=0.5, ncol=3,
    title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5,
    # anchor is relative to the current (last drawn) axes; the offsets were
    # presumably hand-tuned for this grid layout
    bbox_to_anchor=(-1.15, 5.5),
)
plt.legend(legend_handles, ['One Way', 'Round Trip'], **legend_style)

plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.86 Average bike rentals based on weekday over trip type by bike type and pass type.png',
            dpi=300, bbox_inches='tight')
month, trip_type, bike_type and pass_type columns:¶ Columns: month, trip_type, bike_type, pass_type. Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal). Plot: Point plot. Find average rentals based on the month over trip_type:
Create a dataset which contains bike rentals relative to each month over the respective years. Care should be taken to include only rentals that appear in every month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider the bike rentals recorded for every month in any year.
# Rentals per (year, month) for every trip type / bike type / pass type
# combination: count the non-null trip_id values in each group.
# NOTE(review): if the grouping columns are categorical, pandas' `observed`
# setting decides whether unobserved combinations appear - confirm this
# matches the "available combinations only" intent described above.
group_keys = ["year", "month", "trip_type", "bike_type", "pass_type"]
month_df = (
    bikeshare.groupby(group_keys)["trip_id"]
    .count()
    .reset_index(name="rentals")
)
month_df.head(10)
Point plot:
# Point plot of rentals per month, one line per trip type,
# faceted by bike type (rows) and pass type (columns).
sb.set_style('white')
trip_type_colors = ['#e36297', '#42d4be']
sb.set_palette(trip_type_colors, n_colors=2, desat=0.8)

# x-axis categories in ascending order
x_order = month_df['month'].sort_values().unique()
g = sb.FacetGrid(data=month_df, row='bike_type', col='pass_type',
                 hue='trip_type', height=4, aspect=1)
# pointplot aggregates `rentals` per x category (seaborn's default estimator
# is the mean); ci=None switches off the error bars.
g.map(sb.pointplot, "month", "rentals",
      order=x_order, linestyles="-", ci=None, markers=['.'])

# leave headroom above the grid for the figure-level title
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Average bike rentals based on month over trip type by bike type and pass type',
               fontsize=16, weight='bold')
g.set_titles('Bike = {row_name} | Pass = {col_name}',
             weight='bold', size=14, color='dimgrey')

# axis labels and tick formatting
g.set_xlabels('\nMonth of the year', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)
for facet_ax in g.axes.flat:
    # blank the labels at even positions (0, 2, 4, ...) so only every
    # second month name is printed
    thinned = [
        '' if pos % 2 == 0 else tick
        for pos, tick in enumerate(facet_ax.get_xticklabels())
    ]
    facet_ax.set_xticklabels(thinned, size=12)

# hand-built legend: one proxy line per trip type, coloured from the active palette
legend_handles = [
    Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
    for shade in sb.color_palette()[:2]
]
legend_style = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha=1,
    borderpad=1, borderaxespad=1, labelspacing=0.5, ncol=3,
    title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5,
    # anchor is relative to the current (last drawn) axes; the offsets were
    # presumably hand-tuned for this grid layout
    bbox_to_anchor=(-1.15, 5.5),
)
plt.legend(legend_handles, ['One Way', 'Round Trip'], **legend_style)

plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.87 Average bike rentals based on month over trip type by bike type and pass type.png',
            dpi=300, bbox_inches='tight')
quarter, trip_type, bike_type and pass_type columns:¶ Columns: quarter, trip_type, bike_type, pass_type. Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal). Plot: Point plot. Find average rentals based on the quarter over trip_type:
Create a dataset which contains bike rentals relative to each quarter over the respective years. Care should be taken to include only rentals that appear in every quarter over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider the bike rentals recorded for every quarter in any year.
# Rentals per (year, quarter) for every trip type / bike type / pass type
# combination: count the non-null trip_id values in each group.
# NOTE(review): if the grouping columns are categorical, pandas' `observed`
# setting decides whether unobserved combinations appear - confirm this
# matches the "available combinations only" intent described above.
group_keys = ["year", "quarter", "trip_type", "bike_type", "pass_type"]
quarter_df = (
    bikeshare.groupby(group_keys)["trip_id"]
    .count()
    .reset_index(name="rentals")
)
quarter_df.head(10)
Point plot:
# Point plot of rentals per quarter, one line per trip type,
# faceted by bike type (rows) and pass type (columns).
sb.set_style('white')
trip_type_colors = ['#e36297', '#42d4be']
sb.set_palette(trip_type_colors, n_colors=2, desat=0.8)

# x-axis categories in ascending order
x_order = quarter_df['quarter'].sort_values().unique()
g = sb.FacetGrid(data=quarter_df, row='bike_type', col='pass_type',
                 hue='trip_type', height=4.5, aspect=1)
# pointplot aggregates `rentals` per x category (seaborn's default estimator
# is the mean); ci=None switches off the error bars.
g.map(sb.pointplot, "quarter", "rentals",
      order=x_order, linestyles="-", ci=None, markers=['.'])

# leave headroom above the grid for the figure-level title
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Average bike rentals based on quarter of year over trip type by bike type and pass type',
               fontsize=16, weight='bold')
g.set_titles('Bike = {row_name} | Pass = {col_name}',
             weight='bold', size=14, color='dimgrey')

# axis labels and tick label sizes (few categories, so no thinning or rotation)
g.set_xlabels('\nQuarter of the year', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_xticklabels(size=12)
g.set_yticklabels(size=12)

# hand-built legend: one proxy line per trip type, coloured from the active palette
legend_handles = [
    Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
    for shade in sb.color_palette()[:2]
]
legend_style = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha=1,
    borderpad=1, borderaxespad=1, labelspacing=0.5, ncol=3,
    title='Trip type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5,
    # anchor is relative to the current (last drawn) axes; the offsets were
    # presumably hand-tuned for this grid layout
    bbox_to_anchor=(-1.15, 5.5),
)
plt.legend(legend_handles, ['One Way', 'Round Trip'], **legend_style)

plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.88 Average bike rentals based on quarter of year over trip type by bike type and pass type.png',
            dpi=300, bbox_inches='tight')
hour, fare_type, bike_type and pass_type columns:¶ Columns: hour, fare_type, bike_type, pass_type. Data type: (numerical, continuous), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal). Plot: Point plot. Find average rentals based on the hour over fare_type:
Create a dataset which contains bike rentals relative to each hour in the day over the respective months in the year. Care should be taken to include only hours that appear in every day of the month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider the bike rentals recorded for every hour in any day.
# Rentals per (year, month, day, hour) for every fare type / bike type /
# pass type combination: count the non-null trip_id values in each group.
# NOTE(review): if the grouping columns are categorical, pandas' `observed`
# setting decides whether unobserved combinations appear - confirm this
# matches the "available combinations only" intent described above.
group_keys = ["year", "month", "day", "hour",
              "fare_type", "bike_type", "pass_type"]
hours_df = (
    bikeshare.groupby(group_keys)["trip_id"]
    .count()
    .reset_index(name="rentals")
)
hours_df.head(10)
Point plot:
# Point plot of rentals per hour of the day, one line per fare type,
# faceted by bike type (rows) and pass type (columns).
sb.set_style('white')
fare_type_colors = ['#466887', '#eda668']
sb.set_palette(fare_type_colors, n_colors=2, desat=0.6)

# x-axis categories in ascending order
x_order = hours_df['hour'].sort_values().unique()
g = sb.FacetGrid(data=hours_df, row='bike_type', col='pass_type',
                 hue='fare_type', height=4, aspect=1)
# pointplot aggregates `rentals` per x category (seaborn's default estimator
# is the mean); ci=None switches off the error bars.
g.map(sb.pointplot, "hour", "rentals",
      order=x_order, linestyles="-", ci=None, markers=['.'])

# leave headroom above the grid for the figure-level title
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Average bike rentals based on hour of the day over fare type by bike type and pass type',
               fontsize=16, weight='bold')
g.set_titles('Bike = {row_name} | Pass = {col_name}',
             weight='bold', size=14, color='dimgrey')

# axis labels and tick formatting
g.set_xlabels('\nHour of the day', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)
for facet_ax in g.axes.flat:
    # keep every fifth tick label (positions 0, 5, 10, ...) and blank the rest
    thinned = [
        tick if pos % 5 == 0 else ''
        for pos, tick in enumerate(facet_ax.get_xticklabels())
    ]
    facet_ax.set_xticklabels(thinned, size=12)

# hand-built legend: one proxy line per fare type, coloured from the active palette
legend_handles = [
    Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
    for shade in sb.color_palette()[:2]
]
legend_style = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha=1,
    borderpad=1, borderaxespad=1, labelspacing=0.5, ncol=3,
    title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5,
    # anchor is relative to the current (last drawn) axes; the offsets were
    # presumably hand-tuned for this grid layout
    bbox_to_anchor=(-1.15, 5.5),
)
plt.legend(legend_handles, ['Basic', 'Extended'], **legend_style)

plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.89 Average bike rentals based on hour over fare type by bike type and pass type.png',
            dpi=300, bbox_inches='tight')
daytime, fare_type, bike_type and pass_type columns:¶ Columns: daytime, fare_type, bike_type, pass_type. Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal). Plot: Point plot. Find average rentals based on the time of day (daytime) over fare_type:
Create a dataset which contains bike rentals relative to each daytime in the day over the respective months in the year. Care should be taken to include only daytimes that appear in every day of the month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider the bike rentals recorded for every daytime in any day.
# Rentals per (year, month, day, daytime) for every fare type / bike type /
# pass type combination: count the non-null trip_id values in each group.
# NOTE(review): if the grouping columns are categorical, pandas' `observed`
# setting decides whether unobserved combinations appear - confirm this
# matches the "available combinations only" intent described above.
group_keys = ["year", "month", "day", "daytime",
              "fare_type", "bike_type", "pass_type"]
daytime_df = (
    bikeshare.groupby(group_keys)["trip_id"]
    .count()
    .reset_index(name="rentals")
)
daytime_df.head(10)
Point plot:
# Point plot of rentals per daytime, one line per fare type,
# faceted by bike type (rows) and pass type (columns).
sb.set_style('white')
fare_type_colors = ['#466887', '#eda668']
sb.set_palette(fare_type_colors, n_colors=2, desat=0.6)

# x-axis categories in ascending order
x_order = daytime_df['daytime'].sort_values().unique()
g = sb.FacetGrid(data=daytime_df, row='bike_type', col='pass_type',
                 hue='fare_type', height=4, aspect=1)
# pointplot aggregates `rentals` per x category (seaborn's default estimator
# is the mean); ci=None switches off the error bars.
g.map(sb.pointplot, "daytime", "rentals",
      order=x_order, linestyles="-", ci=None, markers=['.'])

# leave headroom above the grid for the figure-level title
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Average bike rentals based on time of the day over fare type by bike type and pass type',
               fontsize=16, weight='bold')
g.set_titles('Bike = {row_name} | Pass = {col_name}',
             weight='bold', size=14, color='dimgrey')

# axis labels and tick formatting
g.set_xlabels('\nTime of the day', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)
for facet_ax in g.axes.flat:
    # tilt the category names so they do not overlap
    facet_ax.set_xticklabels(facet_ax.get_xticklabels(), rotation=30, size=12)

# hand-built legend: one proxy line per fare type, coloured from the active palette
legend_handles = [
    Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
    for shade in sb.color_palette()[:2]
]
legend_style = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha=1,
    borderpad=1, borderaxespad=1, labelspacing=0.5, ncol=3,
    title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5,
    # anchor is relative to the current (last drawn) axes; the offsets were
    # presumably hand-tuned for this grid layout
    bbox_to_anchor=(-1.15, 5.5),
)
plt.legend(legend_handles, ['Basic', 'Extended'], **legend_style)

plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.90 Average bike rentals based on daytime over the fare type by bike type and pass type.png',
            dpi=300, bbox_inches='tight')
day, fare_type, bike_type and pass_type columns:¶ Columns: day, fare_type, bike_type, pass_type. Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal). Plot: Point plot. Find average rentals based on the day of the month over fare_type:
Create a dataset which contains bike rentals relative to each day over the respective months in the year. Care should be taken to include only days that appear in every month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider the bike rentals recorded for every day in any month.
# Rentals per calendar day (year, month, day) for every fare type / bike type /
# pass type combination: count the non-null trip_id values in each group.
# NOTE(review): if the grouping columns are categorical, pandas' `observed`
# setting decides whether unobserved combinations appear - confirm this
# matches the "available combinations only" intent described above.
group_keys = ["year", "month", "day",
              "fare_type", "bike_type", "pass_type"]
days_df = (
    bikeshare.groupby(group_keys)["trip_id"]
    .count()
    .reset_index(name="rentals")
)
days_df.head(10)
Point plot:
# Point plot of rentals per day of the month, one line per fare type,
# faceted by bike type (rows) and pass type (columns).
sb.set_style('white')
fare_type_colors = ['#466887', '#eda668']
sb.set_palette(fare_type_colors, n_colors=2, desat=0.6)

# x-axis categories in ascending order
x_order = days_df['day'].sort_values().unique()
g = sb.FacetGrid(data=days_df, row='bike_type', col='pass_type',
                 hue='fare_type', height=4, aspect=1)
# pointplot aggregates `rentals` per x category (seaborn's default estimator
# is the mean); ci=None switches off the error bars.
g.map(sb.pointplot, "day", "rentals",
      order=x_order, linestyles="-", ci=None, markers=['.'])

# leave headroom above the grid for the figure-level title
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Average bike rentals based on day of the month over fare type by bike type and pass type',
               fontsize=16, weight='bold')
g.set_titles('Bike = {row_name} | Pass = {col_name}',
             weight='bold', size=14, color='dimgrey')

# axis labels and tick formatting
g.set_xlabels('\nDay of the month', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)
for facet_ax in g.axes.flat:
    # keep every fifth tick label (positions 0, 5, 10, ...) and blank the rest
    thinned = [
        tick if pos % 5 == 0 else ''
        for pos, tick in enumerate(facet_ax.get_xticklabels())
    ]
    facet_ax.set_xticklabels(thinned, size=12)

# hand-built legend: one proxy line per fare type, coloured from the active palette
legend_handles = [
    Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
    for shade in sb.color_palette()[:2]
]
legend_style = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha=1,
    borderpad=1, borderaxespad=1, labelspacing=0.5, ncol=3,
    title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5,
    # anchor is relative to the current (last drawn) axes; the offsets were
    # presumably hand-tuned for this grid layout
    bbox_to_anchor=(-1.15, 5.5),
)
plt.legend(legend_handles, ['Basic', 'Extended'], **legend_style)

plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.91 Average bike rentals based on day over fare type by bike type and pass type.png',
            dpi=300, bbox_inches='tight')
weekday, fare_type, bike_type and pass_type columns:¶ Columns: weekday, fare_type, bike_type, pass_type. Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal). Plot: Point plot. Find average rentals based on the day of the week over fare_type:
Create a dataset which contains bike rentals relative to each day over the respective weeks in the month. Care should be taken to include only days that appear in every week over individual months. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider the bike rentals recorded for every day in any week.
# Rentals per (year, month, week, weekday) for every fare type / bike type /
# pass type combination: count the non-null trip_id values in each group.
# NOTE(review): if the grouping columns are categorical, pandas' `observed`
# setting decides whether unobserved combinations appear - confirm this
# matches the "available combinations only" intent described above.
group_keys = ["year", "month", "week", "weekday",
              "fare_type", "bike_type", "pass_type"]
weekday_df = (
    bikeshare.groupby(group_keys)["trip_id"]
    .count()
    .reset_index(name="rentals")
)
weekday_df.head(10)
Point plot:
# Point plot of rentals per day of the week, one line per fare type,
# faceted by bike type (rows) and pass type (columns).
sb.set_style('white')
fare_type_colors = ['#466887', '#eda668']
sb.set_palette(fare_type_colors, n_colors=2, desat=0.6)

# x-axis categories in ascending order
x_order = weekday_df['weekday'].sort_values().unique()
g = sb.FacetGrid(data=weekday_df, row='bike_type', col='pass_type',
                 hue='fare_type', height=4.5, aspect=1)
# pointplot aggregates `rentals` per x category (seaborn's default estimator
# is the mean); ci=None switches off the error bars.
g.map(sb.pointplot, "weekday", "rentals",
      order=x_order, linestyles="-", ci=None, markers=['.'])

# leave headroom above the grid for the figure-level title
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Average bike rentals based on day of the week over fare type by bike type and pass type',
               fontsize=16, weight='bold')
g.set_titles('Bike = {row_name} | Pass = {col_name}',
             weight='bold', size=14, color='dimgrey')

# axis labels and tick formatting
g.set_xlabels('\nDay of the week', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)
for facet_ax in g.axes.flat:
    # tilt the weekday names so they do not overlap
    facet_ax.set_xticklabels(facet_ax.get_xticklabels(), rotation=30, size=12)

# hand-built legend: one proxy line per fare type, coloured from the active palette
legend_handles = [
    Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
    for shade in sb.color_palette()[:2]
]
legend_style = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha=1,
    borderpad=1, borderaxespad=1, labelspacing=0.5, ncol=3,
    title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5,
    # anchor is relative to the current (last drawn) axes; the offsets were
    # presumably hand-tuned for this grid layout
    bbox_to_anchor=(-1.15, 5.5),
)
plt.legend(legend_handles, ['Basic', 'Extended'], **legend_style)

plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.92 Average bike rentals based on weekday over fare type by bike type and pass type.png',
            dpi=300, bbox_inches='tight')
month, fare_type, bike_type and pass_type columns:¶ Columns: month, fare_type, bike_type, pass_type. Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal). Plot: Point plot. Find average rentals based on the month over fare_type:
Create a dataset which contains bike rentals relative to each month over the respective years. Care should be taken to include only rentals that appear in every month over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider the bike rentals recorded for every month in any year.
# Rentals per (year, month) for every fare type / bike type / pass type
# combination: count the non-null trip_id values in each group.
# NOTE(review): if the grouping columns are categorical, pandas' `observed`
# setting decides whether unobserved combinations appear - confirm this
# matches the "available combinations only" intent described above.
group_keys = ["year", "month", "fare_type", "bike_type", "pass_type"]
month_df = (
    bikeshare.groupby(group_keys)["trip_id"]
    .count()
    .reset_index(name="rentals")
)
month_df.head(10)
Point plot:
# Point plot of rentals per month, one line per fare type,
# faceted by bike type (rows) and pass type (columns).
sb.set_style('white')
fare_type_colors = ['#466887', '#eda668']
sb.set_palette(fare_type_colors, n_colors=2, desat=0.6)

# x-axis categories in ascending order
x_order = month_df['month'].sort_values().unique()
g = sb.FacetGrid(data=month_df, row='bike_type', col='pass_type',
                 hue='fare_type', height=4, aspect=1)
# pointplot aggregates `rentals` per x category (seaborn's default estimator
# is the mean); ci=None switches off the error bars.
g.map(sb.pointplot, "month", "rentals",
      order=x_order, linestyles="-", ci=None, markers=['.'])

# leave headroom above the grid for the figure-level title
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Average bike rentals based on month over fare type by bike type and pass type',
               fontsize=16, weight='bold')
g.set_titles('Bike = {row_name} | Pass = {col_name}',
             weight='bold', size=14, color='dimgrey')

# axis labels and tick formatting
g.set_xlabels('\nMonth of the year', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_yticklabels(size=12)
for facet_ax in g.axes.flat:
    # blank the labels at even positions (0, 2, 4, ...) so only every
    # second month name is printed
    thinned = [
        '' if pos % 2 == 0 else tick
        for pos, tick in enumerate(facet_ax.get_xticklabels())
    ]
    facet_ax.set_xticklabels(thinned, size=12)

# hand-built legend: one proxy line per fare type, coloured from the active palette
legend_handles = [
    Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
    for shade in sb.color_palette()[:2]
]
legend_style = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha=1,
    borderpad=1, borderaxespad=1, labelspacing=0.5, ncol=3,
    title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5,
    # anchor is relative to the current (last drawn) axes; the offsets were
    # presumably hand-tuned for this grid layout
    bbox_to_anchor=(-1.15, 5.5),
)
plt.legend(legend_handles, ['Basic', 'Extended'], **legend_style)

plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.93 Average bike rentals based on month over fare type by bike type and pass type.png',
            dpi=300, bbox_inches='tight')
quarter, fare_type, bike_type and pass_type columns:¶ Columns: quarter, fare_type, bike_type, pass_type. Data type: (Categorical, ordered), (Categorical, ordered), (Categorical, nominal) and (Categorical, nominal). Plot: Point plot. Find average rentals based on the quarter over fare_type:
Create a dataset which contains bike rentals relative to each quarter over the respective years. Care should be taken to include only rentals that appear in every quarter over individual years. Use the available categorical combinations and do not fill the NULL values with numerical zeros, so as to consider the bike rentals recorded for every quarter in any year.
# Rentals per (year, quarter) for every fare type / bike type / pass type
# combination: count the non-null trip_id values in each group.
# NOTE(review): if the grouping columns are categorical, pandas' `observed`
# setting decides whether unobserved combinations appear - confirm this
# matches the "available combinations only" intent described above.
group_keys = ["year", "quarter", "fare_type", "bike_type", "pass_type"]
quarter_df = (
    bikeshare.groupby(group_keys)["trip_id"]
    .count()
    .reset_index(name="rentals")
)
quarter_df.head(10)
Point plot:
# Point plot of rentals per quarter, one line per fare type,
# faceted by bike type (rows) and pass type (columns).
sb.set_style('white')
fare_type_colors = ['#466887', '#eda668']
sb.set_palette(fare_type_colors, n_colors=2, desat=0.6)

# x-axis categories in ascending order
x_order = quarter_df['quarter'].sort_values().unique()
g = sb.FacetGrid(data=quarter_df, row='bike_type', col='pass_type',
                 hue='fare_type', height=4.5, aspect=1)
# pointplot aggregates `rentals` per x category (seaborn's default estimator
# is the mean); ci=None switches off the error bars.
g.map(sb.pointplot, "quarter", "rentals",
      order=x_order, linestyles="-", ci=None, markers=['.'])

# leave headroom above the grid for the figure-level title
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Average bike rentals based on quarter of year over fare type by bike type and pass type',
               fontsize=16, weight='bold')
g.set_titles('Bike = {row_name} | Pass = {col_name}',
             weight='bold', size=14, color='dimgrey')

# axis labels and tick label sizes (few categories, so no thinning or rotation)
g.set_xlabels('\nQuarter of the year', size=14)
g.set_ylabels('Avg. bike rentals\n', size=14)
g.set_xticklabels(size=12)
g.set_yticklabels(size=12)

# hand-built legend: one proxy line per fare type, coloured from the active palette
legend_handles = [
    Line2D([], [], marker='.', color=shade, linestyle='-', linewidth=2)
    for shade in sb.color_palette()[:2]
]
legend_style = dict(
    scatterpoints=1, frameon=True, fancybox=True, shadow=False, framealpha=1,
    borderpad=1, borderaxespad=1, labelspacing=0.5, ncol=3,
    title='Fare type', title_fontsize=12, fontsize=10, facecolor='white',
    markerfirst=True, handlelength=2, handletextpad=0.5,
    # anchor is relative to the current (last drawn) axes; the offsets were
    # presumably hand-tuned for this grid layout
    bbox_to_anchor=(-1.15, 5.5),
)
plt.legend(legend_handles, ['Basic', 'Extended'], **legend_style)

plt.subplots_adjust(wspace=0.05, hspace=0.3)
# bbox_inches='tight' grows the saved area so all x and y labels are included
plt.savefig('plots/3.3.94 Average bike rentals based on quarter of year over fare type by bike type and pass type.png',
            dpi=300, bbox_inches='tight')
|
|